
Commit

Merge pull request #201 from data61/feature/#198
independent dropout, refactor out MAP, closes #198
dsteinberg committed Aug 28, 2018
2 parents 7429a03 + b84a645 commit 12c6503
Showing 10 changed files with 96 additions and 79 deletions.
10 changes: 5 additions & 5 deletions aboleth/__init__.py
@@ -4,8 +4,8 @@
from .losses import elbo, max_posterior
from .baselayers import stack
from .layers import (Activation, DropOut, MaxPool2D, Flatten, DenseVariational,
EmbedVariational, Conv2DVariational, DenseMAP, EmbedMAP,
Conv2DMAP, InputLayer, RandomFourier, RandomArcCosine)
EmbedVariational, Conv2DVariational, Dense, Embed, Conv2D,
InputLayer, RandomFourier, RandomArcCosine)
from .hlayers import Concat, Sum, PerFeature
from .impute import (MaskInputLayer, MeanImpute, FixedNormalImpute,
LearnedScalarImpute, FixedScalarImpute,
@@ -28,9 +28,9 @@
'Conv2DVariational',
'DenseVariational',
'EmbedVariational',
'Conv2DMAP',
'DenseMAP',
'EmbedMAP',
'Conv2D',
'Dense',
'Embed',
'RandomFourier',
'RandomArcCosine',
'norm_prior',
55 changes: 36 additions & 19 deletions aboleth/layers.py
@@ -119,11 +119,11 @@ def __call__(self, X):
Net, KL = super(SampleLayer3, self).__call__(X)
return Net, KL


#
# Activation and Transformation Layers
#


class Activation(Layer):
"""Activation function layer.
@@ -156,30 +156,45 @@ class DropOut(Layer):
keep_prob : float, Tensor
the probability of keeping an input. See `tf.nn.dropout
<https://www.tensorflow.org/api_docs/python/tf/nn/dropout>`_.
independent: bool
Use independently sampled dropout masks for each observation if ``True``.
This may dramatically speed up convergence, but the layer will no longer
only sample the latent function.
observation_axis : int
The axis that indexes the observations (``N``). This will assume the
observations are on the *second* axis, i.e. ``(n_samples, N, ...)``.
This is so we can repeat the dropout pattern over observations, which
has the effect of dropping out weights consistently, thereby sampling
the "latent function" of the layer.
the "latent function" of the layer. This is only active if
``independent`` is set to ``False``.
alpha : bool
Use alpha dropout (``tf.contrib.nn.alpha_dropout``), which maintains the
self-normalising property of SNNs.
Note
----
If a more complex noise shape, or some other modification to dropout is
required, you can use an Activation layer. E.g.
``ab.Activation(lambda x: tf.nn.dropout(x, **your_args))``.
"""

def __init__(self, keep_prob, observation_axis=1, alpha=False):
def __init__(self, keep_prob, independent=True, observation_axis=1,
alpha=False):
"""Create an instance of a Dropout layer."""
self.keep_prob = keep_prob
self.obsax = observation_axis
self.independent = independent
self.dropout = tf.contrib.nn.alpha_dropout if alpha else tf.nn.dropout

def _build(self, X):
"""Build the graph of this layer."""
# Set noise shape to equivalent to different samples from posterior
# i.e. share the samples along the data-observations axis
noise_shape = tf.concat([tf.shape(X)[:self.obsax], [1],
tf.shape(X)[(self.obsax + 1):]], axis=0)
noise_shape = None
if not self.independent:
noise_shape = tf.concat([tf.shape(X)[:self.obsax], [1],
tf.shape(X)[(self.obsax + 1):]], axis=0)
Net = self.dropout(X, self.keep_prob, noise_shape, seed=next(seedgen))
KL = 0.
return Net, KL
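For reference, a minimal sketch of the difference between the two dropout modes, assuming TensorFlow 1.x's ``tf.nn.dropout`` signature (tensor names here are illustrative):

import tensorflow as tf

# X is (n_samples, N, D): posterior samples, observations, features.
X = tf.random_normal((3, 5, 4))
keep_prob = 0.9

# independent=True: a fresh mask for every element (noise_shape=None).
independent_drop = tf.nn.dropout(X, keep_prob)

# independent=False: share one mask along the observation axis (axis 1), so
# each posterior sample keeps a consistent set of dropped units, i.e. a
# consistent draw of the layer's "latent function".
noise_shape = tf.concat([tf.shape(X)[:1], [1], tf.shape(X)[2:]], axis=0)
shared_drop = tf.nn.dropout(X, keep_prob, noise_shape=noise_shape)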
@@ -687,11 +702,11 @@ def _build(self, X):
return Net, KL


class Conv2DMAP(SampleLayer):
r"""A 2D convolution layer, with maximum a posteriori (MAP) inference.
class Conv2D(SampleLayer):
r"""A 2D convolution layer.
This layer uses maximum *a-posteriori* inference to learn the
convolutional kernels and biases, and so also returns complexity
This layer uses maximum likelihood or maximum *a-posteriori* inference to
learn the convolutional kernels and biases, and so also returns complexity
penalties (l1 or l2) for the weights and biases.
Parameters
@@ -771,8 +786,8 @@ def _weight_shapes(self, channels):
return weight_shape, bias_shape
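For context, a hedged usage sketch of the renamed convolutional layer; the layer names and parameters mirror the MNIST demo further below, and the values are illustrative only:

import tensorflow as tf
import aboleth as ab

# Illustrative stack: convolution, pooling, flattening, then a dense output;
# the Conv2D and Dense layers return l2 complexity penalties for their weights.
net = ab.stack(
    ab.InputLayer(name='X', n_samples=5),
    ab.Conv2D(filters=32, kernel_size=(5, 5), l2_reg=0.01),
    ab.Activation(h=tf.nn.relu),
    ab.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    ab.Flatten(),
    ab.Dense(output_dim=10, l2_reg=0.01),
)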


class DenseMAP(SampleLayer):
r"""Dense (fully connected) linear layer, with MAP inference.
class Dense(SampleLayer):
r"""Dense (fully connected) linear layer.
This implements a linear layer, and when called returns
@@ -782,8 +797,8 @@ class DenseMAP(SampleLayer):
where :math:`\mathbf{X} \in \mathbb{R}^{N \times D_{in}}`,
:math:`\mathbf{W} \in \mathbb{R}^{D_{in} \times D_{out}}` and
:math:`\mathbf{b} \in \mathbb{R}^{D_{out}}`. This layer uses maximum
*a-posteriori* inference to learn the weights and biases, and so also
returns complexity penalities (l1 or l2) for the weights and biases.
likelihood or maximum *a-posteriori* inference to learn the weights and
biases, and so also returns complexity penalties (l1 or l2) for the
weights and biases.
Parameters
----------
@@ -807,7 +823,7 @@ class DenseMAP(SampleLayer):

def __init__(self, output_dim, l1_reg=0., l2_reg=0., use_bias=True,
init_fn='glorot'):
"""Create and instance of a dense layer with MAP regularizers."""
"""Create and instance of a dense layer with regularizers."""
self.output_dim = output_dim
self.l1 = l1_reg
self.l2 = l2_reg
@@ -841,8 +857,8 @@ def _build(self, X):
return Net, penalty
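For reference, a minimal sketch (raw TensorFlow 1.x, with illustrative names) of the kind of l1/l2 complexity penalty these layers return alongside their output, which corresponds to MAP inference with Laplace/Gaussian priors on the weights:

import tensorflow as tf

# Illustrative only: an l2 (Gaussian prior) plus l1 (Laplace prior) penalty
# on a weight matrix, of the sort returned as `penalty` above.
W = tf.get_variable("W_example", shape=(10, 1))
l1_reg, l2_reg = 0., 0.05
penalty = l2_reg * tf.nn.l2_loss(W) + l1_reg * tf.reduce_sum(tf.abs(W))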


class EmbedMAP(SampleLayer3):
r"""Dense (fully connected) embedding layer, with MAP inference.
class Embed(SampleLayer3):
r"""Dense (fully connected) embedding layer.
This layer works directly on inputs of *K* category *indices* rather than
one-hot representations, for efficiency. Each column of the input is
@@ -855,8 +871,9 @@ class EmbedMAP(SampleLayer3):
Here :math:`\mathbf{X} \in \mathbb{N}_2^{N \times K}` and :math:`\mathbf{W}
\in \mathbb{R}^{K \times D_{out}}`. Though in code we represent
:math:`\mathbf{X}` as a vector of indices in :math:`\mathbb{N}_K^{N \times
1}`. This layer uses maximum *a-posteriori* inference to learn the weights
and so also returns complexity penalities (l1 or l2) for the weights.
1}`. This layer uses maximum likelihood or maximum *a-posteriori* inference
to learn the weights and so also returns complexity penalties (l1 or l2)
for the weights.
Parameters
----------
@@ -881,7 +898,7 @@ class EmbedMAP(SampleLayer3):

def __init__(self, output_dim, n_categories, l1_reg=0., l2_reg=0.,
init_fn='glorot'):
"""Create and instance of a MAP embedding layer."""
"""Create and instance of an embedding layer."""
assert n_categories >= 2, "Need 2 or more categories for embedding!"
self.output_dim = output_dim
self.n_categories = n_categories
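A hedged usage sketch of the renamed embedding layer, assuming the categorical input is a single column of integer indices as the docstring above describes (parameter values are illustrative):

import aboleth as ab

# Embed a column of category indices in {0, ..., 9} into 8 dimensions; an
# l2 complexity penalty is returned for the embedding weights.
net = (
    ab.InputLayer(name="X", n_samples=1) >>
    ab.Embed(output_dim=8, n_categories=10, l2_reg=0.05)
)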
8 changes: 4 additions & 4 deletions demos/classification.py
@@ -34,16 +34,16 @@
net = ab.stack(
ab.InputLayer(name='X', n_samples=n_samples_),
ab.DropOut(0.95, alpha=True),
ab.DenseMAP(output_dim=128, l2_reg=REG, init_fn="autonorm"),
ab.Dense(output_dim=128, l2_reg=REG, init_fn="autonorm"),
ab.Activation(h=tf.nn.selu),
ab.DropOut(0.9, alpha=True),
ab.DenseMAP(output_dim=64, l2_reg=REG, init_fn="autonorm"),
ab.Dense(output_dim=64, l2_reg=REG, init_fn="autonorm"),
ab.Activation(h=tf.nn.selu),
ab.DropOut(0.9, alpha=True),
ab.DenseMAP(output_dim=32, l2_reg=REG, init_fn="autonorm"),
ab.Dense(output_dim=32, l2_reg=REG, init_fn="autonorm"),
ab.Activation(h=tf.nn.selu),
ab.DropOut(0.9, alpha=True),
ab.DenseMAP(output_dim=1, l2_reg=REG, init_fn="autonorm"),
ab.Dense(output_dim=1, l2_reg=REG, init_fn="autonorm"),
)


18 changes: 8 additions & 10 deletions demos/mnist_softmax_regression.py
@@ -23,29 +23,27 @@
# Network architecture
net = ab.stack(
ab.InputLayer(name='X', n_samples=l_samples), # LSAMPLES,BATCH_SIZE,28*28
ab.Conv2DMAP(filters=32,
kernel_size=(5, 5),
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 28, 28, 32
ab.Conv2D(filters=32,
kernel_size=(5, 5),
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 28, 28, 32
ab.Activation(h=tf.nn.relu),
ab.MaxPool2D(pool_size=(2, 2),
strides=(2, 2)), # LSAMPLES, BATCH_SIZE, 14, 14, 32

ab.Conv2DMAP(filters=64,
kernel_size=(5, 5),
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 14, 14, 64
ab.Conv2D(filters=64,
kernel_size=(5, 5),
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 14, 14, 64
ab.Activation(h=tf.nn.relu),
ab.MaxPool2D(pool_size=(2, 2),
strides=(2, 2)), # LSAMPLES, BATCH_SIZE, 7, 7, 64

ab.Flatten(), # LSAMPLES, BATCH_SIZE, 7*7*64

ab.DenseMAP(output_dim=1024,
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 1024
ab.Dense(output_dim=1024, l2_reg=reg), # LSAMPLES, BATCH_SIZE, 1024
ab.Activation(h=tf.nn.relu),
ab.DropOut(0.5),

ab.DenseMAP(output_dim=10,
l2_reg=reg), # LSAMPLES, BATCH_SIZE, 10
ab.Dense(output_dim=10, l2_reg=reg), # LSAMPLES, BATCH_SIZE, 10
)


26 changes: 13 additions & 13 deletions demos/regression_tutorial.py
@@ -42,7 +42,7 @@ def linear(X, Y):

net = (
ab.InputLayer(name="X") >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_)
ab.Dense(output_dim=1, l2_reg=lambda_)
)

Xw, reg = net(X=X)
@@ -76,13 +76,13 @@ def nnet(X, Y):

net = (
ab.InputLayer(name="X", n_samples=1) >>
ab.DenseMAP(output_dim=40, l2_reg=lambda_) >>
ab.Dense(output_dim=40, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=20, l2_reg=lambda_) >>
ab.Dense(output_dim=20, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=10, l2_reg=lambda_) >>
ab.Dense(output_dim=10, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_)
ab.Dense(output_dim=1, l2_reg=lambda_)
)

f, reg = net(X=X)
@@ -98,15 +98,15 @@ def nnet_dropout(X, Y):

net = (
ab.InputLayer(name="X", n_samples=n_samples_) >>
ab.DenseMAP(output_dim=40, l2_reg=lambda_) >>
ab.Dense(output_dim=40, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=0.9) >>
ab.DenseMAP(output_dim=20, l2_reg=lambda_) >>
ab.DropOut(keep_prob=0.9, independent=True) >>
ab.Dense(output_dim=20, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=0.95) >>
ab.DenseMAP(output_dim=10, l2_reg=lambda_) >>
ab.DropOut(keep_prob=0.95, independent=True) >>
ab.Dense(output_dim=10, l2_reg=lambda_) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_)
ab.Dense(output_dim=1, l2_reg=lambda_)
)

f, reg = net(X=X)
@@ -149,8 +149,8 @@ def svr(X, Y):
# ab.InputLayer(name="X", n_samples=n_samples_) >>
ab.InputLayer(name="X", n_samples=1) >>
ab.RandomFourier(n_features=50, kernel=kern) >>
# ab.DropOut(keep_prob=0.9) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_)
# ab.DropOut(keep_prob=0.9, independent=True) >>
ab.Dense(output_dim=1, l2_reg=lambda_)
)

f, reg = net(X=X)
6 changes: 3 additions & 3 deletions docs/quickstart.rst
@@ -24,7 +24,7 @@ regularisation on the model weights:
layers = (
ab.InputLayer(name="X") >>
ab.DenseMAP(output_dim=1, l1_reg=0, l2_reg=.05) >>
ab.Dense(output_dim=1, l1_reg=0, l2_reg=.05) >>
ab.Activation(tf.nn.sigmoid)
)
@@ -90,7 +90,7 @@ as shorthand for :math:`p(y_n = 1)`.

layers = (
ab.InputLayer(name="X") >>
ab.DenseMAP(output_dim=1, l1_reg=0, l2_reg=.05) >>
ab.Dense(output_dim=1, l1_reg=0, l2_reg=.05)
)
net, reg = layers(X=X_)

@@ -258,7 +258,7 @@ following,
ab.Activation(tf.nn.sigmoid)
)
Note we are using ``DenseVariational`` instead of ``DenseMAP``. In the
Note we are using ``DenseVariational`` instead of ``Dense``. In the
``DenseVariational`` layer the ``full`` parameter tells the layer to use a full
covariance Gaussian, and ``prior_std`` is the value of the weight prior standard
deviation, :math:`\psi`. Also we've set ``n_samples=5`` (as a default value of
6 changes: 3 additions & 3 deletions docs/tutorials/keras_integration.rst
@@ -44,13 +44,13 @@ the layer weights / biases. The following are effectively equivalent:
net = (
ab.InputLayer(name="X", n_samples=n_samples_) >>
ab.DenseMAP(output_dim=64, l2_reg=0.01, l1_reg=0.) >>
ab.Dense(output_dim=64, l2_reg=0.01, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=.5) >>
ab.DenseMAP(output_dim=64, l2_reg=0.01, l1_reg=0.) >>
ab.Dense(output_dim=64, l2_reg=0.01, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=.5) >>
ab.DenseMAP(output_dim=1, l2_reg=0.01, l1_reg=0.)
ab.Dense(output_dim=1, l2_reg=0.01, l1_reg=0.)
)
.. code:: python
24 changes: 12 additions & 12 deletions docs/tutorials/some_regressors.rst
@@ -79,7 +79,7 @@ following code,
net = (
ab.InputLayer(name="X") >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
ab.Dense(output_dim=1, l2_reg=lambda_, l1_reg=0.)
)
f, reg = net(X=X)
@@ -232,13 +232,13 @@ Aboleth is here:
net = (
ab.InputLayer(name="X", n_samples=1) >>
ab.DenseMAP(output_dim=40, l2_reg=lambda_, l1_reg=0.) >>
ab.Dense(output_dim=40, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=20, l2_reg=lambda_, l1_reg=0.) >>
ab.Dense(output_dim=20, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=10, l2_reg=lambda_, l1_reg=0.) >>
ab.Dense(output_dim=10, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
ab.Dense(output_dim=1, l2_reg=lambda_, l1_reg=0.)
)
f, reg = net(X=X)
@@ -267,15 +267,15 @@ layers.
net = (
ab.InputLayer(name="X", n_samples=n_samples_) >>
ab.DenseMAP(output_dim=40, l2_reg=lambda_, l1_reg=0.) >>
ab.Dense(output_dim=40, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=0.9) >>
ab.DenseMAP(output_dim=20, l2_reg=lambda_, l1_reg=0.) >>
ab.DropOut(keep_prob=0.9, independent=True) >>
ab.Dense(output_dim=20, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DropOut(keep_prob=0.95) >>
ab.DenseMAP(output_dim=10, l2_reg=lambda_, l1_reg=0.) >>
ab.DropOut(keep_prob=0.95, independent=True) >>
ab.Dense(output_dim=10, l2_reg=lambda_, l1_reg=0.) >>
ab.Activation(tf.tanh) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
ab.Dense(output_dim=1, l2_reg=lambda_, l1_reg=0.)
)
f, reg = net(X=X)
@@ -376,7 +376,7 @@ The code for this is as follows:
net = (
ab.InputLayer(name="X", n_samples=1) >>
ab.RandomFourier(n_features=50, kernel=kern) >>
ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
ab.Dense(output_dim=1, l2_reg=lambda_, l1_reg=0.)
)
f, reg = net(X=X)
