
Commit

Merge pull request #201 from data61/feature/#198
independent dropout, refactor out MAP, closes #198
dsteinberg committed Aug 28, 2018
2 parents 7429a03 + b84a645 commit 12c6503
Showing 10 changed files with 96 additions and 79 deletions.
10 changes: 5 additions & 5 deletions aboleth/__init__.py
@@ -4,8 +4,8 @@
from .losses import elbo, max_posterior
from .baselayers import stack
from .layers import (Activation, DropOut, MaxPool2D, Flatten, DenseVariational,
EmbedVariational, Conv2DVariational, DenseMAP, EmbedMAP,
Conv2DMAP, InputLayer, RandomFourier, RandomArcCosine)
EmbedVariational, Conv2DVariational, Dense, Embed, Conv2D,
InputLayer, RandomFourier, RandomArcCosine)
from .hlayers import Concat, Sum, PerFeature
from .impute import (MaskInputLayer, MeanImpute, FixedNormalImpute,
LearnedScalarImpute, FixedScalarImpute,
@@ -28,9 +28,9 @@
'Conv2DVariational',
'DenseVariational',
'EmbedVariational',
'Conv2DMAP',
'DenseMAP',
'EmbedMAP',
'Conv2D',
'Dense',
'Embed',
'RandomFourier',
'RandomArcCosine',
'norm_prior',
55 changes: 36 additions & 19 deletions aboleth/layers.py
@@ -119,11 +119,11 @@ def __call__(self, X):
Net, KL = super(SampleLayer3, self).__call__(X)
return Net, KL


#
# Activation and Transformation Layers
#


class Activation(Layer):
"""Activation function layer.
@@ -156,30 +156,45 @@ class DropOut(Layer):
keep_prob : float, Tensor
the probability of keeping an input. See `tf.nn.dropout
<https://www.tensorflow.org/api_docs/python/tf/nn/dropout>`_.
independent: bool
Use independently sampled dropout masks for each observation if ``True``.
This may dramatically speed up convergence, but the layer will no longer
only sample the latent function.
observation_axis : int
The axis that indexes the observations (``N``). This will assume the
observations are on the *second* axis, i.e. ``(n_samples, N, ...)``.
This is so we can repeat the dropout pattern over observations, which
has the effect of dropping out weights consistently, thereby sampling
the "latent function" of the layer.
the "latent function" of the layer. This is only active if
``independent`` is set to ``False``.
alpha : bool
Use alpha dropout (``tf.contrib.nn.alpha_dropout``), which maintains the
self-normalising property of SNNs.
Note
----
If a more complex noise shape, or some other modification to dropout is
required, you can use an Activation layer. E.g.
``ab.Activation(lambda x: tf.nn.dropout(x, **your_args))``.
"""

def __init__(self, keep_prob, observation_axis=1, alpha=False):
def __init__(self, keep_prob, independent=True, observation_axis=1,
alpha=False):
"""Create an instance of a Dropout layer."""
self.keep_prob = keep_prob
self.obsax = observation_axis
self.independent = independent
self.dropout = tf.contrib.nn.alpha_dropout if alpha else tf.nn.dropout

def _build(self, X):
"""Build the graph of this layer."""
# Set noise shape to equivalent to different samples from posterior
# i.e. share the samples along the data-observations axis
noise_shape = tf.concat([tf.shape(X)[:self.obsax], [1],
tf.shape(X)[(self.obsax + 1):]], axis=0)
noise_shape = None
if not self.independent:
noise_shape = tf.concat([tf.shape(X)[:self.obsax], [1],
tf.shape(X)[(self.obsax + 1):]], axis=0)
Net = self.dropout(X, self.keep_prob, noise_shape, seed=next(seedgen))
KL = 0.
return Net, KL
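For reference, a minimal sketch of the difference between the two dropout modes, assuming TensorFlow 1.x's ``tf.nn.dropout`` signature (tensor names here are illustrative):

import tensorflow as tf

# X is (n_samples, N, D): posterior samples, observations, features.
X = tf.random_normal((3, 5, 4))
keep_prob = 0.9

# independent=True: a fresh mask for every element (noise_shape=None).
independent_drop = tf.nn.dropout(X, keep_prob)

# independent=False: share one mask along the observation axis (axis 1), so
# each posterior sample keeps a consistent set of dropped units, i.e. a
# consistent draw of the layer's "latent function".
noise_shape = tf.concat([tf.shape(X)[:1], [1], tf.shape(X)[2:]], axis=0)
shared_drop = tf.nn.dropout(X, keep_prob, noise_shape=noise_shape)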
@@ -687,11 +702,11 @@ def _build(self, X):
return Net, KL


class Conv2DMAP(SampleLayer):
r"""A 2D convolution layer, with maximum a posteriori (MAP) inference.
class Conv2D(SampleLayer):
r"""A 2D convolution layer.
This layer uses maximum *a-posteriori* inference to learn the
convolutional kernels and biases, and so also returns complexity
This layer uses maximum likelihood or maximum *a-posteriori* inference to
learn the convolutional kernels and biases, and so also returns complexity
penalties (l1 or l2) for the weights and biases.
Parameters
@@ -771,8 +786,8 @@ def _weight_shapes(self, channels):
return weight_shape, bias_shape
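For context, a hedged usage sketch of the renamed convolutional layer; the layer names and parameters mirror the MNIST demo further below, and the values are illustrative only:

import tensorflow as tf
import aboleth as ab

# Illustrative stack: convolution, pooling, flattening, then a dense output;
# the Conv2D and Dense layers return l2 complexity penalties for their weights.
net = ab.stack(
    ab.InputLayer(name='X', n_samples=5),
    ab.Conv2D(filters=32, kernel_size=(5, 5), l2_reg=0.01),
    ab.Activation(h=tf.nn.relu),
    ab.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    ab.Flatten(),
    ab.Dense(output_dim=10, l2_reg=0.01),
)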


class DenseMAP(SampleLayer):
r"""Dense (fully connected) linear layer, with MAP inference.
class Dense(SampleLayer):
r"""Dense (fully connected) linear layer.
This implements a linear layer, and when called returns
@@ -782,8 +797,8 @@ class DenseMAP(SampleLayer):
where :math:`\mathbf{X} \in \mathbb{R}^{N \times D_{in}}`,
:math:`\mathbf{W} \in \mathbb{R}^{D_{in} \times D_{out}}` and
:math:`\mathbf{b} \in \mathbb{R}^{D_{out}}`. This layer uses maximum
*a-posteriori* inference to learn the weights and biases, and so also
returns complexity penalities (l1 or l2) for the weights and biases.
likelihood or maximum *a-posteriori* inference to learn the weights and
biases, and so also returns complexity penalties (l1 or l2) for the
weights and biases.
Parameters
----------
@@ -807,7 +823,7 @@ class DenseMAP(SampleLayer):

def __init__(self, output_dim, l1_reg=0., l2_reg=0., use_bias=True,
init_fn='glorot'):
"""Create and instance of a dense layer with MAP regularizers."""
"""Create and instance of a dense layer with regularizers."""
self.output_dim = output_dim
self.l1 = l1_reg
self.l2 = l2_reg
@@ -841,8 +857,8 @@ def _build(self, X):
return Net, penalty
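For reference, a minimal sketch (raw TensorFlow 1.x, with illustrative names) of the kind of l1/l2 complexity penalty these layers return alongside their output, which corresponds to MAP inference with Laplace/Gaussian priors on the weights:

import tensorflow as tf

# Illustrative only: an l2 (Gaussian prior) plus l1 (Laplace prior) penalty
# on a weight matrix, of the sort returned as `penalty` above.
W = tf.get_variable("W_example", shape=(10, 1))
l1_reg, l2_reg = 0., 0.05
penalty = l2_reg * tf.nn.l2_loss(W) + l1_reg * tf.reduce_sum(tf.abs(W))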


class EmbedMAP(SampleLayer3):
r"""Dense (fully connected) embedding layer, with MAP inference.
class Embed(SampleLayer3):
r"""Dense (fully connected) embedding layer.
This layer works directly on inputs of *K* category *indices* rather than
one-hot representations, for efficiency. Each column of the input is
@@ -855,8 +871,9 @@ class EmbedMAP(SampleLayer3):
Here :math:`\mathbf{X} \in \mathbb{N}_2^{N \times K}` and :math:`\mathbf{W}
\in \mathbb{R}^{K \times D_{out}}`. Though in code we represent
:math:`\mathbf{X}` as a vector of indices in :math:`\mathbb{N}_K^{N \times
1}`. This layer uses maximum *a-posteriori* inference to learn the weights
and so also returns complexity penalities (l1 or l2) for the weights.
1}`. This layer uses maximum likelihood or maximum *a-posteriori* inference
to learn the weights and so also returns complexity penalties (l1 or l2)
for the weights.
Parameters
----------
@@ -881,7 +898,7 @@ class EmbedMAP(SampleLayer3):

def __init__(self, output_dim, n_categories, l1_reg=0., l2_reg=0.,
init_fn='glorot'):
"""Create and instance of a MAP embedding layer."""
"""Create and instance of an embedding layer."""
assert n_categories >= 2, "Need 2 or more categories for embedding!"
self.output_dim = output_dim
self.n_categories = n_categories
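A hedged usage sketch of the renamed embedding layer, assuming the categorical input is a single column of integer indices as the docstring above describes (parameter values are illustrative):

import aboleth as ab

# Embed a column of category indices in {0, ..., 9} into 8 dimensions; an
# l2 complexity penalty is returned for the embedding weights.
net = (
    ab.InputLayer(name="X", n_samples=1) >>
    ab.Embed(output_dim=8, n_categories=10, l2_reg=0.05)
)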
8 changes: 4 additions & 4 deletions demos/classification.py
@@ -34,16 +34,16 @@
net = ab.stack(
ab.InputLayer(name='X', n_samples=n_samples_),
ab.DropOut(0.95, alpha=True),
ab.DenseMAP(output_dim=128, l2_reg=REG, init_fn="autonorm"),
ab.Dense(output_dim=128, l2_reg=REG, init_fn="autonorm"),
ab.Activation(h=tf.nn.selu),
ab.DropOut(0.9, alpha=True),
ab.DenseMAP(output_dim=64, l2_reg=REG, init_fn="autonorm"),
ab.Dense(output_dim=64, l2_reg=REG, init_fn="autonorm"),
ab.Activation(h=tf.nn.selu),
ab.DropOut(0.9, alpha=True),
ab.DenseMAP(output_dim=32, l2_reg=REG, init_fn="autonorm"),
ab.Dense(output_dim=32, l2_reg=REG, init_fn="autonorm"),
ab.Activation(h=tf.nn.selu),
ab.DropOut(0.9, alpha=True),
ab.DenseMAP(output_dim=1, l2_reg=REG, init_fn="autonorm"),
ab.Dense(output_dim=1, l2_reg=REG, init_fn="autonorm"),
)


18 changes: 8 additions & 10 deletions demos/mnist_softmax_regression.py
@@ -23,29 +23,27 @@
# Network architecture
net = ab.stack(
ab.InputLayer(name='X', n_samples=l_samples), # LSAMPLES,BATCH_SIZE,28*28
ab.Conv2DMAP(filters=32,
kernel_size=(5, 5),
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 28, 28, 32
ab.Conv2D(filters=32,
kernel_size=(5, 5),
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 28, 28, 32
ab.Activation(h=tf.nn.relu),
ab.MaxPool2D(pool_size=(2, 2),
strides=(2, 2)), # LSAMPLES, BATCH_SIZE, 14, 14, 32

ab.Conv2DMAP(filters=64,
kernel_size=(5, 5),
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 14, 14, 64
ab.Conv2D(filters=64,
kernel_size=(5, 5),
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 14, 14, 64
ab.Activation(h=tf.nn.relu),
ab.MaxPool2D(pool_size=(2, 2),
strides=(2, 2)), # LSAMPLES, BATCH_SIZE, 7, 7, 64

ab.Flatten(), # LSAMPLES, BATCH_SIZE, 7*7*64

ab.DenseMAP(output_dim=1024,
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 1024
ab.Dense(output_dim=1024, l2_reg=reg), # LSAMPLES, BATCH_SIZE, 1024
ab.Activation(h=tf.nn.relu),
ab.DropOut(0.5),

ab.DenseMAP(output_dim=10,
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 10
ab.Dense(output_dim=10, l2_reg=reg), # LSAMPLES, BATCH_SIZE, 10
)


26 changes: 13 additions & 13 deletions demos/regression_tutorial.py
@@ -42,7 +42,7 @@ def linear(X, Y):

net = (
ab.InputLayer(name="X") >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_)
ab.Dense(output_dim=1, l2_reg=lambda_)
)

Xw, reg = net(X=X)
@@ -76,13 +76,13 @@ def nnet(X, Y):

net = (
ab.InputLayer(name="X", n_samples=1) >>
ab.DenseMAP(output_dim=40, l2_reg=lambda_) >>
ab.Dense(output_dim=40, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=20, l2_reg=lambda_) >>
ab.Dense(output_dim=20, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=10, l2_reg=lambda_) >>
ab.Dense(output_dim=10, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_)
ab.Dense(output_dim=1, l2_reg=lambda_)
)

f, reg = net(X=X)
@@ -98,15 +98,15 @@ def nnet_dropout(X, Y):

net = (
ab.InputLayer(name="X", n_samples=n_samples_) >>
ab.DenseMAP(output_dim=40, l2_reg=lambda_) >>
ab.Dense(output_dim=40, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=0.9) >>
ab.DenseMAP(output_dim=20, l2_reg=lambda_) >>
ab.DropOut(keep_prob=0.9, independent=True) >>
ab.Dense(output_dim=20, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=0.95) >>
ab.DenseMAP(output_dim=10, l2_reg=lambda_) >>
ab.DropOut(keep_prob=0.95, independent=True) >>
ab.Dense(output_dim=10, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_)
ab.Dense(output_dim=1, l2_reg=lambda_)
)

f, reg = net(X=X)
@@ -149,8 +149,8 @@ def svr(X, Y):
# ab.InputLayer(name="X", n_samples=n_samples_) >>
ab.InputLayer(name="X", n_samples=1) >>
ab.RandomFourier(n_features=50, kernel=kern) >>
# ab.DropOut(keep_prob=0.9) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_)
# ab.DropOut(keep_prob=0.9, independent=True) >>
ab.Dense(output_dim=1, l2_reg=lambda_)
)

f, reg = net(X=X)
6 changes: 3 additions & 3 deletions docs/quickstart.rst
@@ -24,7 +24,7 @@ regularisation on the model weights:
layers = (
ab.InputLayer(name="X") >>
ab.DenseMAP(output_dim=1, l1_reg=0, l2_reg=.05) >>
ab.Dense(output_dim=1, l1_reg=0, l2_reg=.05) >>
ab.Activation(tf.nn.sigmoid)
)
@@ -90,7 +90,7 @@ as shorthand for :math:`p(y_n = 1)`.

layers = (
ab.InputLayer(name="X") >>
ab.DenseMAP(output_dim=1, l1_reg=0, l2_reg=.05) >>
ab.Dense(output_dim=1, l1_reg=0, l2_reg=.05)
)
net, reg = layers(X=X_)

@@ -258,7 +258,7 @@ following,
ab.Activation(tf.nn.sigmoid)
)
Note we are using ``DenseVariational`` instead of ``DenseMAP``. In the
Note we are using ``DenseVariational`` instead of ``Dense``. In the
``DenseVariational`` layer the ``full`` parameter tells the layer to use a full
covariance Gaussian, and ``prior_std`` is the value of the weight prior standard
deviation, :math:`\psi`. Also we've set ``n_samples=5`` (as a default value of
6 changes: 3 additions & 3 deletions docs/tutorials/keras_integration.rst
@@ -44,13 +44,13 @@ the layer weights / biases. The following are effectively equivalent:
net = (
ab.InputLayer(name="X", n_samples=n_samples_) >>
ab.DenseMAP(output_dim=64, l2_reg=0.01, l1_reg=0.) >>
ab.Dense(output_dim=64, l2_reg=0.01, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=.5) >>
ab.DenseMAP(output_dim=64, l2_reg=0.01, l1_reg=0.) >>
ab.Dense(output_dim=64, l2_reg=0.01, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=.5) >>
ab.DenseMAP(output_dim=1, l2_reg=0.01, l1_reg=0.)
ab.Dense(output_dim=1, l2_reg=0.01, l1_reg=0.)
)
.. code:: python
24 changes: 12 additions & 12 deletions docs/tutorials/some_regressors.rst
@@ -79,7 +79,7 @@ following code,
net = (
ab.InputLayer(name="X") >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
ab.Dense(output_dim=1, l2_reg=lambda_, l1_reg=0.)
)
f, reg = net(X=X)
@@ -232,13 +232,13 @@ Aboleth is here:
net = (
ab.InputLayer(name="X", n_samples=1) >>
ab.DenseMAP(output_dim=40, l2_reg=lambda_, l1_reg=0.) >>
ab.Dense(output_dim=40, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=20, l2_reg=lambda_, l1_reg=0.) >>
ab.Dense(output_dim=20, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=10, l2_reg=lambda_, l1_reg=0.) >>
ab.Dense(output_dim=10, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
ab.Dense(output_dim=1, l2_reg=lambda_, l1_reg=0.)
)
f, reg = net(X=X)
@@ -267,15 +267,15 @@ layers.
net = (
ab.InputLayer(name="X", n_samples=n_samples_) >>
ab.DenseMAP(output_dim=40, l2_reg=lambda_, l1_reg=0.) >>
ab.Dense(output_dim=40, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=0.9) >>
ab.DenseMAP(output_dim=20, l2_reg=lambda_, l1_reg=0.) >>
ab.DropOut(keep_prob=0.9, independent=True) >>
ab.Dense(output_dim=20, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=0.95) >>
ab.DenseMAP(output_dim=10, l2_reg=lambda_, l1_reg=0.) >>
ab.DropOut(keep_prob=0.95, independent=True) >>
ab.Dense(output_dim=10, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
ab.Dense(output_dim=1, l2_reg=lambda_, l1_reg=0.)
)
f, reg = net(X=X)
@@ -376,7 +376,7 @@ The code for this is as follows:
net = (
ab.InputLayer(name="X", n_samples=1) >>
ab.RandomFourier(n_features=50, kernel=kern) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
ab.Dense(output_dim=1, l2_reg=lambda_, l1_reg=0.)
)
f, reg = net(X=X)
