In [1]:
import random

import numpy as np
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow_probability as tfp
tfd = tfp.distributions
tfb = tfp.bijectors
tfpl = tfp.layers
tfkl = tfk.layers

In [3]:
# Toy sizes: number of rows per batch and number of relaxed binary latents.
batch_size = 8
latent_space_size = 12

# Random conditioning input: one 32-dimensional vector per batch row.
cond = tf.random.uniform(shape=(batch_size, 32))

# Conditional masked network producing one parameter (a logit) per latent dimension.
autoregressive_net = tfb.AutoregressiveNetwork(
    params=1,
    event_shape=[latent_space_size],
    hidden_units=[128, 128],
    activation='relu',
    conditional=True,
    conditional_event_shape=tf.shape(cond)[1:])


def _relaxed_bernoulli(x):
    """Relaxed Bernoulli whose logits are the network output for `x` given `cond`."""
    net_output = autoregressive_net(x, conditional_input=cond)
    logits = tf.unstack(net_output, axis=-1)[0]
    return tfd.RelaxedBernoulli(logits=logits, temperature=1. / 2.)


distribution = tfd.Autoregressive(
    distribution_fn=_relaxed_bernoulli,
    sample0=tf.zeros([batch_size, latent_space_size]))

2021-12-23 16:53:19.339436: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1-step Markov chain estimation
$s_0 \to \frac{1}{2}: s_1 + \frac{1}{4}: s_2 + \frac{1}{4}: s_3$

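$s_0$ is initial (label $00$), $s_1$ is safe ($10$), and $s_2, s_3$ are unsafe ($01$); each state is encoded as its two label bits followed by one extra bit that distinguishes $s_2$ from $s_3$, i.e.,\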
$s_0$: $000$, \
$s_1$: $100$, \
$s_2$: $010$, \
$s_3$: $011$

In [2]:
# One row per state of the chain, using the 3-bit encodings listed above.
states = tf.constant([
    [0., 0., 0.],  # s_0
    [1., 0., 0.],  # s_1
    [0., 1., 0.],  # s_2
    [0., 1., 1.],  # s_3
])

2022-01-14 16:41:48.541693: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
dataset_size = 1024 * 100


def _draw_successor():
    """Sample one successor of s_0 using at most two fair coin tosses."""
    # First toss: heads sends s_0 to s_1.
    if random.random() <= 0.5:
        return states[1]
    # Second toss (only after tails): heads sends s_0 to s_2 ...
    if random.random() <= 0.5:
        return states[2]
    # ... and tails sends it to s_3.
    return states[3]


dataset = tf.stack([_draw_successor() for _ in range(dataset_size)])

dataset

<tf.Tensor: shape=(102400, 3), dtype=float32, numpy=
array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 1.],
       [0., 1., 0.],
       [1., 0., 0.]], dtype=float32)>

In [4]:
# Per-bit frequency of ones in the sampled dataset (column sums divided by dataset_size).
tf.reduce_sum(dataset, axis=0) / dataset_size

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.4988086 , 0.5011914 , 0.25016603], dtype=float32)>

In [19]:
batch_size = 8
event_shape = (3, )


def _negative_log_likelihood(y, predicted_distribution):
    """Negative log-probability of the observed state `y` under the model output."""
    return -predicted_distribution.log_prob(y)


# One dense layer produces the parameters consumed by the IndependentBernoulli head.
model = tfk.Sequential([
    tfkl.Dense(tfpl.IndependentBernoulli.params_size(event_shape)),
    tfpl.IndependentBernoulli(event_shape),
])
model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
    loss=_negative_log_likelihood)

# Every training example shares the same all-zero input; only the targets vary.
fit_options = dict(
    batch_size=batch_size,
    epochs=4,
    steps_per_epoch=dataset_size // batch_size,
    shuffle=True,
    verbose=True)
model.fit(x=tf.zeros(shape=(dataset_size, 1)), y=dataset, **fit_options)

# Query the fitted distribution with the same constant input used for training.
bernoulli = model(tf.zeros(shape=(1, 1)))
tf.print("Bernoulli probs", bernoulli.mean())
for state_index in (1, 2, 3):
    tf.print(
        f"P(s_{state_index} | s_0) =",
        tf.exp(bernoulli.log_prob(states[state_index])))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Bernoulli probs [[0.55006218 0.44993785 0.244285166]]
P(s_1 | s_0) = [0.228655428]
P(s_2 | s_0) = [0.15299]
P(s_3 | s_0) = [0.0494540781]


In [20]:
batch_size = 8
event_shape = (3, )

# Masked network emitting one logit per bit of the 3-bit state encoding.
autoregressor = tfb.AutoregressiveNetwork(
    params=1,
    event_shape=event_shape,
    hidden_units=[64, 64],
    activation='relu')


def _bernoulli_given(previous_sample):
    """Independent Bernoulli over the bits, with logits read from the masked network."""
    logits = tf.unstack(autoregressor(previous_sample), axis=-1)[0]
    return tfd.Independent(tfd.Bernoulli(logits=logits), reinterpreted_batch_ndims=1)


distribution = tfd.Autoregressive(_bernoulli_given, sample0=tf.zeros(event_shape))

# Keras model mapping a state to its log-probability under `distribution`.
x = tfk.Input(shape=event_shape)
log_prob = distribution.log_prob(x)
model = tfk.Model(x, log_prob)

model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=1e-2),
    # The model output is already a log-probability; the loss is its negation
    # and the target argument is ignored.
    loss=lambda _, predicted_log_prob: -predicted_log_prob)

training_options = dict(
    batch_size=batch_size,
    epochs=4,
    steps_per_epoch=dataset_size // batch_size,
    shuffle=True,
    verbose=True)
model.fit(
    x=dataset,
    y=tf.zeros(shape=(dataset_size, )),  # log_prob = 0 <=> prob = 1
    **training_options)

for state_index in (1, 2, 3):
    tf.print(f"P(s_{state_index} | s_0) =", distribution.prob(states[state_index]))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
P(s_1 | s_0) = 0.477598876
P(s_2 | s_0) = 0.242849588
P(s_3 | s_0) = 0.279551566


# With a MaskedAutoregressiveFlow

In [41]:
batch_size = 8
event_shape = (3, )

# Masked autoregressive network used as the shift function of the flow below.
# No event_shape is passed to it here.
made = tfb.AutoregressiveNetwork(
    params=1,
    hidden_units=[64, 64],
    activation='relu')

# Bernoulli base distribution pushed through an inverted MaskedAutoregressiveFlow
# whose shift comes from `made` and whose log-scale is None.
distribution = tfd.TransformedDistribution(
    distribution=tfd.Sample(
        tfd.Independent(
            tfd.Bernoulli(logits=tf.zeros(shape=event_shape, dtype=tf.float32), dtype=tf.float32),
            reinterpreted_batch_ndims=1),),
    bijector=tfb.Invert(tfb.MaskedAutoregressiveFlow(
        lambda y: (made(y)[..., 0], None),
        is_constant_jacobian=False)))
distribution._made = made  # keep a handle on the network next to the distribution


# NOTE(review): the recorded run of this cell ended with the TypeError shown in its
# output (a symbolic KerasTensor handed to a TF API without dispatch support).
# Presumably it is raised by log_prob(x) on the symbolic input below — confirm.
x = tfk.Input(shape=event_shape)
log_prob = distribution.log_prob(x)
model = tfk.Model(x, log_prob)

model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=1e-2),
    loss= lambda _, log_prob: -log_prob)

model.fit(
    x=dataset,
    y=tf.zeros(shape=(dataset_size, )), # log_prob = 0 <=> prob = 1
    batch_size=batch_size,
    epochs=4,
    steps_per_epoch=dataset_size // batch_size,
    shuffle=True,
    verbose=True)

bernoulli = distribution  # alias; not used again in this cell
tf.print("P(s_1 | s_0) =", distribution.prob(states[1]))
tf.print("P(s_2 | s_0) =", distribution.prob(states[2]))
tf.print("P(s_3 | s_0) =", distribution.prob(states[3]))

TypeError: You are passing KerasTensor(type_spec=TensorSpec(shape=(), dtype=tf.int32, name=None), inferred_value=[3], name='tf.math.reduce_prod_3/Prod:0', description="created by layer 'tf.math.reduce_prod_3'"), an intermediate Keras symbolic input/output, to a TF API that does not allow registering custom dispatchers, such as `tf.cond`, `tf.function`, gradient tapes, or `tf.map_fn`. Keras Functional model construction only supports TF API calls that *do* support dispatching, such as `tf.math.add` or `tf.reshape`. Other APIs cannot be called directly on symbolic Kerasinputs/outputs. You can work around this limitation by putting the operation in a custom Keras layer `call` and calling that layer on this symbolic input/output.

In [46]:
batch_size = 8
event_shape = (3, )

made = tfb.AutoregressiveNetwork(
    params=1,
    hidden_units=[128, 128],
    activation='relu',
    name="InvertMaskedAutoregressorNetwork")


def _shift_only(y):
    """Shift read from the masked network; no log-scale."""
    return made(y)[..., 0], None


# Fair Bernoulli bits as the base distribution of the inverted flow.
base_bernoulli = tfd.Independent(
    tfd.Bernoulli(logits=tf.zeros(shape=event_shape, dtype=tf.float32), dtype=tf.float32),
    reinterpreted_batch_ndims=1)
distribution = tfd.TransformedDistribution(
    distribution=tfd.Sample(base_bernoulli),
    bijector=tfb.Invert(tfb.MaskedAutoregressiveFlow(_shift_only, is_constant_jacobian=True)))
distribution._made = made

# The weights are printed before any sample has been drawn.
print(distribution._made.weights)
distribution.sample(8)

[]


<tf.Tensor: shape=(8, 3), dtype=float32, numpy=
array([[0.        , 1.        , 1.0385809 ],
       [0.        , 1.        , 1.0385809 ],
       [1.        , 0.03900308, 0.09938622],
       [0.        , 1.        , 0.03858088],
       [0.        , 1.        , 0.03858088],
       [1.        , 0.03900308, 1.0993862 ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.        ]], dtype=float32)>

In [139]:
batch_size = 8
event_shape = (3, )
temperature = 1e-5

# Helper bijectors: softclip(v) = scale * tanh(v / scale), minus_log(v) = -log(v).
scale = 3.
softclip = tfb.Chain([tfb.Scale(scale), tfb.Tanh(), tfb.Scale(1. / scale)], name="softclip")
minus_log = tfb.Chain([tfb.Scale(-1.), tfb.Log()], name="minus_log")

made = tfb.AutoregressiveNetwork(
    params=1,
    hidden_units=[128, 128],
    activation=tfb.Softplus(),
    name="InvertMaskedAutoregressorNetwork")


def _shift_and_log_scale(y):
    """Soft-clipped network output divided by the temperature, paired with -log(temperature)."""
    rescale = tfb.Chain([tfb.Scale(1. / temperature), softclip])
    return rescale(made(y)[..., 0]), minus_log(temperature)


# Zero-centred Logistic base with scale 1 / temperature.
logistic_base = tfd.Independent(
    distribution=tfd.Logistic(
        loc=tf.zeros(shape=event_shape, dtype=tf.float32),
        scale=1. / temperature),
    reinterpreted_batch_ndims=1)

distribution = tfd.TransformedDistribution(
    distribution=tfd.Sample(logistic_base),
    bijector=tfb.Invert(tfb.MaskedAutoregressiveFlow(
        _shift_and_log_scale,
        is_constant_jacobian=False)))
distribution._made = made

# Push the flow output through a sigmoid.
distribution = tfd.TransformedDistribution(
    distribution=distribution,
    bijector=tfb.Sigmoid())

distribution.sample(batch_size)

<tf.Tensor: shape=(8, 3), dtype=float32, numpy=
array([[0.07339588, 0.9487939 , 0.9456719 ],
       [0.7476593 , 0.00531152, 0.97534597],
       [0.8681984 , 0.3399576 , 0.95810634],
       [0.27695602, 0.9912412 , 0.96136844],
       [0.93835366, 0.00770125, 0.7739607 ],
       [0.07491547, 0.9981222 , 0.98167545],
       [0.47962868, 0.989711  , 0.922577  ],
       [0.01802978, 0.9891161 , 0.8602406 ]], dtype=float32)>

In [146]:
batch_size = 8
event_shape = (3, )
temperature = 1. / 2.

# Helper bijectors: softclip(v) = scale * tanh(v / scale), minus_log(v) = -log(v).
scale = 3.
softclip = tfb.Chain([tfb.Scale(scale), tfb.Tanh(), tfb.Scale(1. / scale)], name="softclip")
minus_log = tfb.Chain([tfb.Scale(-1.), tfb.Log()], name="minus_log")

made = tfb.AutoregressiveNetwork(
    params=1,
    hidden_units=[128, 128],
    activation=tfb.Softplus(),
    name="InvertMaskedAutoregressorNetwork")


def _shift_only(y):
    """Soft-clipped network output divided by the temperature; no log-scale."""
    rescale = tfb.Chain([tfb.Scale(1. / temperature), softclip])
    return rescale(made(y)[..., 0]), None


# Zero-centred Logistic base with scale 1 / temperature.
logistic_base = tfd.Independent(
    distribution=tfd.Logistic(
        loc=tf.zeros(shape=event_shape, dtype=tf.float32),
        scale=1. / temperature),
    reinterpreted_batch_ndims=1)

distribution = tfd.TransformedDistribution(
    distribution=tfd.Sample(logistic_base),
    bijector=tfb.Invert(tfb.MaskedAutoregressiveFlow(
        _shift_only,
        is_constant_jacobian=True)))
distribution._made = made

# Push the flow output through a sigmoid.
distribution = tfd.TransformedDistribution(
    distribution=distribution,
    bijector=tfb.Sigmoid())

distribution.sample(batch_size)

<tf.Tensor: shape=(8, 3), dtype=float32, numpy=
array([[0.95060885, 0.93537664, 0.7153141 ],
       [0.2801799 , 0.9603685 , 0.5810608 ],
       [0.14436802, 0.6016128 , 0.24179214],
       [0.361446  , 0.5698984 , 0.9822776 ],
       [0.7000871 , 0.2468996 , 0.48104683],
       [0.95207596, 0.6956586 , 0.71504086],
       [0.9911182 , 0.98973346, 0.6715649 ],
       [0.2954213 , 0.9990678 , 0.9973613 ]], dtype=float32)>

In [125]:
# Sanity checks for the helper bijectors: the first three lines should print the
# same value, and so should the last two.
# NOTE(review): the recorded 11.512925 equals -log(1e-5), so `temperature` was
# presumably 1e-5 when this cell was run — confirm against execution order.
print(minus_log(temperature))
print(- tf.math.log(temperature))
print(tf.math.log(1. / temperature))
print(softclip(10.))
print(3. * tf.nn.tanh(10. / 3.))

tf.Tensor(11.512925, shape=(), dtype=float32)
tf.Tensor(11.512925, shape=(), dtype=float32)
tf.Tensor(11.512925, shape=(), dtype=float32)
tf.Tensor(2.9923737, shape=(), dtype=float32)
tf.Tensor(2.9923737, shape=(), dtype=float32)


In [231]:
batch_size = 8
event_shape = (3, )
cond_shape = (5, )
temperature = 1. / 2.

scale = 3.
softclip = tfb.Chain([tfb.Scale(scale), tfb.Tanh(), tfb.Scale(1. / scale)], name="softclip")

# Symbolic inputs: the event and its conditioning vector.
x = tfk.Input(shape=event_shape, dtype=tf.float32)
y = tfk.Input(shape=cond_shape, dtype=tf.float32)

made_network = tfb.AutoregressiveNetwork(
    params=1,
    hidden_units=[128, 128],
    event_shape=event_shape,
    conditional=True,
    conditional_event_shape=cond_shape,
    activation=tfb.Softplus(),
    name="InvertMaskedAutoregressorNetwork")
made = made_network(x, y)


def _shift_and_zero_channel(net_output):
    """Stack the temperature-scaled, soft-clipped shift with an all-zero second channel."""
    shift = tfb.Chain([tfb.Scale(1. / temperature), softclip])(net_output[..., 0])
    zeros = tf.zeros(tf.shape(net_output)[:-1])
    return tf.stack([shift, zeros], axis=-1)


made = tfkl.Lambda(_shift_and_zero_channel)(made)

model = tfk.Model(inputs=[x, y], outputs=made)
model([tf.zeros((8, ) + event_shape), tf.ones((8, ) + cond_shape)])

<tf.Tensor: shape=(8, 3, 2), dtype=float32, numpy=
array([[[-2.110426 ,  0.       ],
        [ 3.2337317,  0.       ],
        [ 2.5936074,  0.       ]],

       [[-2.110426 ,  0.       ],
        [ 3.2337317,  0.       ],
        [ 2.5936074,  0.       ]],

       [[-2.110426 ,  0.       ],
        [ 3.2337317,  0.       ],
        [ 2.5936074,  0.       ]],

       [[-2.110426 ,  0.       ],
        [ 3.2337317,  0.       ],
        [ 2.5936074,  0.       ]],

       [[-2.110426 ,  0.       ],
        [ 3.2337317,  0.       ],
        [ 2.5936074,  0.       ]],

       [[-2.110426 ,  0.       ],
        [ 3.2337317,  0.       ],
        [ 2.5936074,  0.       ]],

       [[-2.110426 ,  0.       ],
        [ 3.2337317,  0.       ],
        [ 2.593608 ,  0.       ]],

       [[-2.110426 ,  0.       ],
        [ 3.2337317,  0.       ],
        [ 2.593608 ,  0.       ]]], dtype=float32)>

### Note: Correct one

### Inverse masked autoregressive flow without Logistic

In [4]:
batch_size = 8
event_shape = (3, )
cond_shape = (5, )
temperature = 1e-5

scale = 3.
softclip = tfb.SoftClip(low=-scale, high=scale)


class AutoRegressiveLogistic(tf.Module):
    """Conditional sampler built from an inverted masked autoregressive flow over a Logistic base.

    Reads the notebook-level `event_shape`, `cond_shape`, `temperature` and
    `softclip` defined just above the class.
    """

    def __init__(self):
        """Build the masked network and wrap it in a Keras model that outputs the flow shift."""
        super().__init__()
        event_input = tfk.Input(shape=event_shape, dtype=tf.float32)
        cond_input = tfk.Input(shape=cond_shape, dtype=tf.float32)

        self._made = tfb.AutoregressiveNetwork(
            params=1,
            hidden_units=[128, 128],
            event_shape=event_shape,
            conditional=True,
            conditional_event_shape=cond_shape,
            activation=tfb.Softplus(),
            name="InvertMaskedAutoregressorNetwork")

        def _scaled_clipped_shift(net_output):
            # Soft-clip the single network parameter, then divide by the temperature.
            return tfb.Chain([tfb.Scale(1. / temperature), softclip])(net_output[..., 0])

        shift = tfkl.Lambda(_scaled_clipped_shift)(self._made(event_input, cond_input))
        self.made = tfk.Model(inputs=[event_input, cond_input], outputs=shift)

    def autoregressive_distribution(self, conditional):
        """Return the sigmoid-transformed flow distribution conditioned on `conditional`."""
        batch_size = tf.shape(conditional)[0]
        logistic_base = tfd.Independent(
            distribution=tfd.Logistic(
                loc=tf.zeros(shape=(batch_size, ) + event_shape, dtype=tf.float32),
                scale=1. / temperature),
            reinterpreted_batch_ndims=1)

        def _shift_only(x):
            # Shift from the Keras model; no log-scale.
            return self.made([x, conditional]), None

        flow = tfb.Invert(tfb.MaskedAutoregressiveFlow(
            _shift_only,
            is_constant_jacobian=False))
        logit_distribution = tfd.TransformedDistribution(
            distribution=tfd.Sample(logistic_base),
            bijector=flow)

        return tfd.TransformedDistribution(
            distribution=logit_distribution,
            bijector=tfb.Sigmoid())

    @tf.function
    def __call__(self, conditional):
        """Draw a sample from the distribution conditioned on `conditional`."""
        return self.autoregressive_distribution(conditional).sample()


x = AutoRegressiveLogistic()(tf.ones((8, ) + cond_shape))
print(x)

tf.Tensor(
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 0.]
 [0. 1. 1.]
 [1. 0. 0.]
 [1. 0. 1.]
 [0. 1. 1.]
 [0. 1. 1.]], shape=(8, 3), dtype=float32)


In [70]:
# Fresh (untrained) model; `d` is its relaxed distribution for an all-ones conditional batch.
d = AutoRegressiveLogistic().autoregressive_distribution(tf.ones((8, ) + cond_shape))
# Round samples of `d` to {0, 1}. Rounding has no true inverse, so the inverse
# is taken to be the identity function.
distribution = tfd.TransformedDistribution(
    distribution=d,
    bijector=tfb.Inline(
        forward_fn=tf.round,
        # Must be a callable mapping a tensor to a tensor. The bijector class
        # `tfb.Identity` would construct a bijector object instead of
        # returning its argument.
        inverse_fn=tf.identity,
        forward_min_event_ndims=0,
        inverse_min_event_ndims=0))


print(distribution.sample())
# Density of `d` at the bit-flipped rounded sample, clipped away from {0, 1}.
print(d.prob(tf.clip_by_value(tf.abs(distribution.sample() - 1.), clip_value_min=1e-7, clip_value_max=1. - 1e-7)))
# NOTE(review): `x` is not defined in this cell; it is presumably the sample
# printed by the cell that defines AutoRegressiveLogistic — confirm.
print(d.prob(tf.clip_by_value(tf.round(x), clip_value_min=1e-7, clip_value_max=1. - 1e-7)))

tf.Tensor(
[[1. 1. 1.]
 [0. 1. 0.]
 [1. 0. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [0. 1. 0.]
 [1. 1. 1.]
 [0. 1. 0.]], shape=(8, 3), dtype=float32)
tf.Tensor(
[458.6469  322.86954 458.6469  458.6469  384.77975 458.6469  458.6469
 458.6469 ], shape=(8,), dtype=float32)
tf.Tensor(
[458.6469  384.85165 384.85165 384.77975 384.77975 384.77975 322.86954
 384.85165], shape=(8,), dtype=float32)


In [208]:
batch_size=8
event_shape=(3, )
cond_shape=(5, )
temperature = 0.5

scale = 3.
softclip = tfb.SoftClip(low=-scale, high=scale)

class AutoRegressiveBernoulli(tfk.Model):
    """Keras model wrapping a masked autoregressive network that outputs soft-clipped logits.

    The model maps an event (and optionally a conditioning input) to one logit
    per event dimension. `relaxed_distribution` and `discrete_distribution`
    turn those logits into autoregressive distributions.
    """

    def __init__(self, event_shape, cond_shape=None):
        """Build the functional model.

        Args:
            event_shape: shape of one event, e.g. `(3, )`.
            cond_shape: shape of one conditioning input, or None for an
                unconditional model.
        """
        x = tfk.Input(shape=event_shape, dtype=tf.float32)
        if cond_shape is not None:
            y = tfk.Input(shape=cond_shape, dtype=tf.float32)
        else:
            y = None

        _made = tfb.AutoregressiveNetwork(
            params=1,
            hidden_units=[128, 128],
            event_shape=event_shape,
            conditional=cond_shape is not None,
            conditional_event_shape=cond_shape,
            activation=tfb.Softplus())
        made = _made(x, conditional_input=y)
        # Keep the single parameter per dimension and bound it with the soft clip.
        made = tfkl.Lambda(
            lambda x: softclip(x[..., 0])
        )(made)

        super(AutoRegressiveBernoulli, self).__init__(inputs=x if y is None else [x, y], outputs=made)
        # Remember the constructor's event shape so the distributions below do
        # not depend on a notebook-level `event_shape` variable.
        self.event_shape = event_shape

    @staticmethod
    def _process_made_inputs(x, conditional = None, *args, **kwargs):
        """Return the model input: `[x, conditional]` when a conditional is given, else `x`."""
        return [x, conditional] if conditional is not None else x

    def relaxed_distribution(self, temperature, conditional=None):
        """Autoregressive relaxed Bernoulli (sigmoid of a Logistic) over the event.

        Args:
            temperature: relaxation temperature; the Logistic has
                loc = logits / temperature and scale = 1 / temperature.
            conditional: optional batch of conditioning inputs.

        Returns:
            A `tfd.Autoregressive` distribution.
        """

        def distribution_fn(x=None):
            # Without a previous sample the logits are all zero; otherwise
            # they come from the network.
            if x is None:
                logits = tf.zeros(shape=self.event_shape, dtype=tf.float32)
            else:
                logits = self(self._process_made_inputs(x, conditional))
            return tfd.Independent(
                distribution=tfd.TransformedDistribution(
                    distribution=tfd.Logistic(
                        loc=logits / temperature,
                        scale=1. / temperature),
                    bijector=tfb.Sigmoid()),
                reinterpreted_batch_ndims=1)

        return tfd.Autoregressive(distribution_fn)

    def discrete_distribution(self, conditional=None):
        """Autoregressive Bernoulli over the event, sharing the network's logits.

        Args:
            conditional: optional batch of conditioning inputs.

        Returns:
            A `tfd.Autoregressive` distribution.
        """

        def distribution_fn(x=None):
            # Same logits rule as in `relaxed_distribution`.
            if x is None:
                logits = tf.zeros(self.event_shape)
            else:
                logits = self(self._process_made_inputs(x, conditional))
            return tfd.Independent(
                distribution=tfd.Bernoulli(logits=logits),
                reinterpreted_batch_ndims=1)

        return tfd.Autoregressive(distribution_fn)

Without conditionals

In [203]:
event_shape = (3, )

autoregressive_model = AutoRegressiveBernoulli(event_shape=event_shape)
distribution = autoregressive_model.relaxed_distribution(temperature=.5)

# Round 8 relaxed samples to binary vectors.
x = tf.round(distribution.sample(8))
print(x)

# Clip away from {0, 1} before evaluating the relaxed density.
x_clipped = tf.clip_by_value(x, clip_value_min=1e-7, clip_value_max=1.-1e-7)
print(distribution.prob(x_clipped))

# Discrete counterpart built from the same model.
discrete_distribution = autoregressive_model.discrete_distribution()
print(discrete_distribution.sample(8))
print(discrete_distribution.prob(x))

autoregressive_model.relaxed_distribution(temperature=.5).sample()

tf.Tensor(
[[0. 1. 1.]
 [0. 1. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 1.]
 [1. 1. 0.]
 [1. 1. 0.]], shape=(8, 3), dtype=float32)
tf.Tensor(
[1.0413770e+10 1.0413770e+10 3.7047258e+09 3.7047258e+09 1.1448654e+09
 3.3035971e+09 3.2323855e+09 3.2323855e+09], shape=(8,), dtype=float32)
tf.Tensor(
[[1 0 0]
 [1 1 1]
 [0 1 1]
 [1 1 1]
 [0 1 1]
 [1 0 1]
 [1 1 0]
 [1 0 0]], shape=(8, 3), dtype=int32)
tf.Tensor(
[0.20455365 0.20455365 0.11671299 0.11671299 0.06483471 0.11513042
 0.11348242 0.11348242], shape=(8,), dtype=float32)


<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.49189883, 0.22471553, 0.9895641 ], dtype=float32)>

With conditionals

In [211]:
event_shape = (3, )
cond_shape = (5, )
batch_size = 8

autoregressive_model = AutoRegressiveBernoulli(event_shape=event_shape, cond_shape=cond_shape)

# All-ones conditioning batch, one row per sample.
conditional_samples = tf.ones((batch_size, ) + cond_shape)
distribution = autoregressive_model.relaxed_distribution(
    temperature=.5,
    conditional=conditional_samples)

x = tf.round(distribution.sample(batch_size))  # works only with sample(batch_size)
print(x)

# Clip away from {0, 1} before evaluating the relaxed density.
x_clipped = tf.clip_by_value(x, clip_value_min=1e-7, clip_value_max=1.-1e-7)
print(distribution.prob(x_clipped))

# Discrete counterpart built from the same model and conditional batch.
discrete_distribution = autoregressive_model.discrete_distribution(
    conditional=conditional_samples)
print(discrete_distribution.sample(batch_size))
print(discrete_distribution.prob(x))

(3,)
tf.Tensor(
[[1. 0. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 0.]
 [1. 1. 1.]
 [1. 1. 0.]
 [1. 1. 1.]], shape=(8, 3), dtype=float32)
tf.Tensor(
[1.2603643e+10 2.7974690e+11 2.7974690e+11 2.7974690e+11 3.1810054e+10
 2.7974690e+11 3.1810054e+10 2.7974690e+11], shape=(8,), dtype=float32)
tf.Tensor(
[[1 0 0]
 [1 1 1]
 [1 1 1]
 [1 0 1]
 [1 1 1]
 [0 1 1]
 [1 0 1]
 [1 1 0]], shape=(8, 3), dtype=int32)
tf.Tensor(
[0.10981445 0.5398283  0.5398283  0.5398283  0.17404315 0.5398283
 0.17404312 0.53982824], shape=(8,), dtype=float32)


### Masked autoregressive flow for generating relaxed Bernoulli (via bijector_fn instead of shift_and_log_scale_fn)

In [2]:
batch_size=8
event_shape=(3, )
cond_shape=(5, )
temperature = 0.5

scale = 3.
softclip = tfb.SoftClip(low=-scale, high=scale)

class MaskedAutoRegressiveBernoulli(tfk.Model):
    """Keras model wrapping a masked autoregressive network that outputs soft-clipped logits.

    `relaxed_distribution` uses the logits as the shift of a masked
    autoregressive flow over a Logistic base; `discrete_distribution` uses
    them as Bernoulli logits.
    """

    def __init__(self, event_shape, cond_shape=None):
        """Build the functional model.

        Args:
            event_shape: shape of one event, e.g. `(3, )`.
            cond_shape: shape of one conditioning input, or None for an
                unconditional model.
        """
        x = tfk.Input(shape=event_shape, dtype=tf.float32)
        if cond_shape is not None:
            y = tfk.Input(shape=cond_shape, dtype=tf.float32)
        else:
            y = None

        _made = tfb.AutoregressiveNetwork(
            params=1,
            hidden_units=[128, 128],
            event_shape=event_shape,
            conditional=cond_shape is not None,
            conditional_event_shape=cond_shape,
            activation=tfb.Softplus())
        made = _made(x, conditional_input=y)
        # Keep the single parameter per dimension and bound it with the soft clip.
        made = tfkl.Lambda(
            lambda x: softclip(x[..., 0])
        )(made)

        super(MaskedAutoRegressiveBernoulli, self).__init__(inputs=x if y is None else [x, y], outputs=made)
        self.event_shape = event_shape

    @staticmethod
    def _process_made_inputs(x, conditional = None, *args, **kwargs):
        """Return the model input: `[x, conditional]` when a conditional is given, else `x`."""
        return [x, conditional] if conditional is not None else x

    def relaxed_distribution(self, temperature, conditional=None):
        """Relaxed Bernoulli built as a masked autoregressive flow.

        Args:
            temperature: relaxation temperature; the Logistic base has
                scale 1 / temperature and the shift is logits / temperature.
            conditional: optional batch of conditioning inputs.

        Returns:
            A `tfd.TransformedDistribution`.
        """

        def bijector_fn(x) -> tfb.Bijector:
            inputs = self._process_made_inputs(x, conditional)
            shift = self(inputs) / temperature
            # Chain applies right to left: shift first, then sigmoid.
            return tfb.Chain([tfb.Sigmoid(), tfb.Shift(shift)])

        return tfd.TransformedDistribution(
            distribution=tfd.Sample(
                tfd.Logistic(loc=0., scale=1. / temperature), sample_shape=self.event_shape),
            bijector=tfb.MaskedAutoregressiveFlow(bijector_fn=bijector_fn))

    def discrete_distribution(self, conditional=None):
        """Autoregressive Bernoulli over the event, sharing the network's logits.

        Args:
            conditional: optional batch of conditioning inputs.

        Returns:
            A `tfd.Autoregressive` distribution.
        """

        def distribution_fn(x=None):
            if x is None:
                # Use the shape given to the constructor, not the notebook-level
                # `event_shape`, so the instance stays consistent with
                # `relaxed_distribution`.
                logits = tf.zeros(self.event_shape)
            else:
                inputs = self._process_made_inputs(x, conditional)
                logits = self(inputs)

            return tfd.Independent(
                distribution=tfd.Bernoulli(logits=logits),
                reinterpreted_batch_ndims=1)

        return tfd.Autoregressive(distribution_fn)

2022-01-19 18:05:04.137086: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
event_shape = (3, )
batch_size = 2

autoregressive_model = MaskedAutoRegressiveBernoulli(event_shape=event_shape)
distribution = autoregressive_model.relaxed_distribution(temperature=.5)

# Round one relaxed sample to a binary vector.
x = tf.round(distribution.sample())
print("rounded relaxed sample", x)

# Clip away from {0, 1} before evaluating the relaxed density.
x_clipped = tf.clip_by_value(x, clip_value_min=1e-7, clip_value_max=1.-1e-7)
print("logistic prob with clipped sample", distribution.prob(x_clipped))

# Discrete counterpart built from the same model.
discrete_distribution = autoregressive_model.discrete_distribution()
print("discrete sample", discrete_distribution.sample())
print("discrete prob", discrete_distribution.prob(x))

# Same model at a lower temperature.
colder_distribution = autoregressive_model.relaxed_distribution(temperature=.1)
print("relaxed sample", colder_distribution.sample())

rounded relaxed sample tf.Tensor([0. 0. 0.], shape=(3,), dtype=float32)
logistic prob with clipped sample tf.Tensor(2416530000.0, shape=(), dtype=float32)
discrete sample tf.Tensor([0 0 1], shape=(3,), dtype=int32)
discrete prob tf.Tensor(0.09348528, shape=(), dtype=float32)
relaxed sample tf.Tensor([9.9999845e-01 9.9999738e-01 2.2329972e-15], shape=(3,), dtype=float32)


With conditionals

In [4]:
event_shape = (3, )
cond_shape = (5, )
batch_size = 4
temperature = 1e-5


autoregressive_model = MaskedAutoRegressiveBernoulli(event_shape=event_shape, cond_shape=cond_shape)

# Random conditioning batch, one row per sample.
conditional_samples = tf.random.uniform((batch_size, ) + cond_shape)
print("conditional samples", conditional_samples)

distribution = autoregressive_model.relaxed_distribution(
    temperature=temperature, conditional=conditional_samples)
x = tf.round(distribution.sample())
print("rounded relaxed sample", x)

# Clip away from {0, 1} before evaluating the relaxed density.
x_clipped = tf.clip_by_value(x, clip_value_min=1e-7, clip_value_max=1.-1e-7)
print("logistic prob with clipped sample", distribution.prob(x_clipped))

discrete_distribution = autoregressive_model.discrete_distribution(conditional=conditional_samples)
# Important: when sampling from the discrete distribution, the batch size must
# be passed explicitly so that it matches the batch size of the conditional event.
print("discrete sample", discrete_distribution.sample(batch_size))
print("discrete prob", discrete_distribution.prob(x))

# A freshly built relaxed distribution with the same settings.
fresh_relaxed = autoregressive_model.relaxed_distribution(
    temperature=temperature, conditional=conditional_samples)
print("relaxed sample", fresh_relaxed.sample())

conditional samples tf.Tensor(
[[0.28799808 0.9021977  0.7689476  0.07662714 0.27008283]
 [0.11628854 0.22444272 0.30513394 0.14792514 0.4099145 ]
 [0.36682987 0.07588935 0.4018519  0.79595053 0.877362  ]
 [0.4473132  0.47209036 0.7915766  0.5921581  0.81145406]], shape=(4, 5), dtype=float32)
rounded relaxed sample tf.Tensor(
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]], shape=(4, 3), dtype=float32)
logistic prob with clipped sample tf.Tensor([14563.715 14587.958 11862.641 11656.918], shape=(4,), dtype=float32)
discrete sample tf.Tensor(
[[0 1 0]
 [1 0 1]
 [0 1 0]
 [1 1 0]], shape=(4, 3), dtype=int32)
discrete prob tf.Tensor([0.16888657 0.17486104 0.16713713 0.17505668], shape=(4,), dtype=float32)
relaxed sample tf.Tensor(
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]], shape=(4, 3), dtype=float32)


In [14]:
# NOTE(review): the recorded run raised AttributeError here — the object at
# layers[2] is an AutoregressiveNetwork and exposes no `.layers` attribute.
autoregressive_model.layers[2].layers

AttributeError: 'AutoregressiveNetwork' object has no attribute 'layers'

=========================================

In [199]:
# NOTE(review): `_model` is not defined in any visible cell; it presumably comes
# from earlier kernel state — confirm before re-running.
# Collect each layer's width and keep the activation of the last layer seen.
hidden_layers, activation = [], None
for layer in _model.layers:
    hidden_layers.append(layer.units)
    activation = layer.activation

print(hidden_layers)
print(activation)

[64, 64]
<function <lambda> at 0x7f894bf1add0>


In [203]:
# Sigmoid bijector applied to raw values, for comparison with tf.nn.sigmoid.
tfb.Sigmoid()([0.1, 0.2, 0.4, 0.5])

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.5249792 , 0.549834  , 0.59868765, 0.6224593 ], dtype=float32)>

In [204]:
# Same inputs through the plain TensorFlow sigmoid op.
tf.nn.sigmoid([0.1, 0.2, 0.4, 0.5])

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.5249792 , 0.549834  , 0.59868765, 0.6224593 ], dtype=float32)>

In [233]:
# Joint density of four relaxed Bernoulli components (temperature 1/2) at one point.
tfd.Independent(
    tfd.RelaxedBernoulli(logits=[0.1, 0.2, 0.4, 0.5], temperature=1./2.),
    reinterpreted_batch_ndims=1
).prob([0.5, 0.9, 0.1, 0.9])

<tf.Tensor: shape=(), dtype=float32, numpy=0.60080117>

In [235]:
# Same point evaluated with an explicit Logistic(loc=logits / 0.5, scale=2.) pushed
# through a sigmoid; the recorded value agrees with the RelaxedBernoulli result
# (0.60080117) to about six decimals.
tfd.Independent(
    tfd.TransformedDistribution(
        distribution=tfd.Logistic(loc=tf.constant([0.1, 0.2, 0.4, 0.5]) / 0.5, scale=2.),
        bijector=tfb.Sigmoid()),
    reinterpreted_batch_ndims=1,
).prob([0.5, 0.9, 0.1, 0.9])

<tf.Tensor: shape=(), dtype=float32, numpy=0.60080075>

In [242]:
# Joint sample from a Blockwise distribution made of the 4-component
# sigmoid-Logistic block and one float Bernoulli(logits=10.) component;
# the recorded sample has 5 entries.
tfd.Blockwise([
    tfd.Independent(
        tfd.TransformedDistribution(
            distribution=tfd.Logistic(loc=tf.constant([0.1, 0.2, 0.4, 0.5]) / 0.5, scale=2.),
            bijector=tfb.Sigmoid()),
        reinterpreted_batch_ndims=1,
    ),
    tfd.Bernoulli(logits=10., dtype=tf.float32)
]).sample()

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([0.8827684 , 0.9125265 , 0.9985579 , 0.06163964, 1.        ],
      dtype=float32)>

In [292]:
batch_size = 8
event_shape = (3, )

autoregressor = tfb.AutoregressiveNetwork(
    params=1,
    event_shape=event_shape,
    hidden_units=[64, 64],
    activation='relu')


def distribution(x):
    """Autoregressive relaxed Bernoulli seeded with `x` as `sample0`.

    Each step is a Logistic with loc = logits / 0.5 and scale = 2. pushed
    through a sigmoid, where the logits come from `autoregressor`.
    """
    return tfd.Autoregressive(
        lambda y: tfd.Independent(
            tfd.TransformedDistribution(
                distribution=tfd.Logistic(
                    loc=tf.unstack(autoregressor(y), axis=-1)[0] / 0.5,
                    scale=2.),
                bijector=tfb.Sigmoid()),
            reinterpreted_batch_ndims=1),
        sample0=x)

# NOTE(review): the recorded run of this cell stopped with a ValueError because
# `autoregressor` creates its variables inside the DistributionLambda layer.
# That problem is NOT addressed here; Keras' own message recommends a
# subclassed Layer for stateful computation.
model = tfk.Sequential([
    tfk.Input(shape=event_shape),
    tfkl.Dense(units=np.prod(event_shape), activation='relu'),
    tfpl.DistributionLambda(make_distribution_fn=distribution)
])

model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
    loss= lambda y, model: -model.log_prob(y))

model.fit(
    # Always provide the same input; its shape must match the declared
    # Input(shape=event_shape).
    x=tf.zeros(shape=(dataset_size,) + event_shape, dtype=tf.float32),
    y=dataset,
    batch_size=batch_size,
    epochs=1,
    steps_per_epoch=dataset_size // batch_size,
    shuffle=True,
    verbose=True)

# The shape is passed as one tuple: tf.zeros(1, 3, dtype=...) would supply
# `dtype` twice. Probabilities are queried on the distribution object, not on
# the `distribution` function.
bernoulli = distribution(tf.zeros((1, 3), dtype=tf.float32))
tf.print("P(s_1 | s_0) =", bernoulli.prob(states[1]))
tf.print("P(s_2 | s_0) =", bernoulli.prob(states[2]))
tf.print("P(s_3 | s_0) =", bernoulli.prob(states[3]))



ValueError: 
The following Variables were created within a Lambda layer (distribution_lambda_20)
but are not tracked by said layer:
  <tf.Variable 'dense/kernel:0' shape=(3, 64) dtype=float32>
  <tf.Variable 'dense/bias:0' shape=(64,) dtype=float32>
  <tf.Variable 'dense_1/kernel:0' shape=(64, 64) dtype=float32>
  <tf.Variable 'dense_1/bias:0' shape=(64,) dtype=float32>
  <tf.Variable 'dense_2/kernel:0' shape=(64, 3) dtype=float32>
  <tf.Variable 'dense_2/bias:0' shape=(3,) dtype=float32>
  <tf.Variable 'distribution_lambda_20/Autoregressive/autoregressive_network_64/Variable:0' shape=() dtype=int64>
  <tf.Variable 'distribution_lambda_20/Autoregressive/autoregressive_network_64/Variable:0' shape=() dtype=int64>
  <tf.Variable 'distribution_lambda_20/Autoregressive/autoregressive_network_64/Variable:0' shape=() dtype=int64>
The layer cannot safely ensure proper Variable reuse across multiple
calls, and consquently this behavior is disallowed for safety. Lambda
layers are not well suited to stateful computation; instead, writing a
subclassed Layer is the recommend way to define layers with
Variables.

In [307]:
batch_size = 8
event_shape = (3, )

# Masked network emitting one logit per bit of the 3-bit state encoding.
autoregressor = tfb.AutoregressiveNetwork(
    params=1,
    event_shape=event_shape,
    hidden_units=[64, 64],
    activation='relu')


def _relaxed_bernoulli_given(previous_sample):
    """Independent relaxed Bernoulli (temperature 1/2) with logits from the masked network."""
    logits = tf.unstack(autoregressor(previous_sample), axis=-1)[0]
    return tfd.Independent(
        tfd.RelaxedBernoulli(logits=logits, temperature=1./2.),
        reinterpreted_batch_ndims=1)


distribution = tfd.Autoregressive(
    _relaxed_bernoulli_given,
    sample0=tf.zeros(shape=(batch_size,) + event_shape),
    validate_args=True)

# Keras model mapping a state to its log-probability under `distribution`.
x = tfk.Input(shape=event_shape)
log_prob = distribution.log_prob(x)
model = tfk.Model(x, log_prob)

model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=1e-2),
    # The model output is already a log-probability; the loss is its negation
    # and the target argument is ignored.
    loss=lambda _, predicted_log_prob: -predicted_log_prob)

training_options = dict(
    batch_size=batch_size,
    epochs=4,
    steps_per_epoch=dataset_size // batch_size,
    shuffle=True,
    verbose=True)
model.fit(
    x=dataset,
    y=tf.zeros(shape=(dataset_size, )),  # log_prob = 0 <=> prob = 1
    **training_options)

bernoulli = distribution
for state_index in (1, 2, 3):
    tf.print(f"P(s_{state_index} | s_0) =", distribution.prob(states[state_index]))

TypeError: __init__() got an unexpected keyword argument 'event_shape'

In [323]:
# Draw 12 samples; the recorded output has shape (12, 8, 3).
distribution.sample(12)

<tf.Tensor: shape=(12, 8, 3), dtype=float32, numpy=
array([[[3.79536152e-02, 2.71249473e-01, 8.78530264e-01],
        [6.35766685e-01, 8.96939754e-01, 9.45525587e-01],
        [7.10226595e-02, 3.24407458e-01, 6.88435137e-01],
        [4.26471233e-04, 9.17298079e-01, 1.70042813e-02],
        [6.73601031e-03, 1.08616352e-02, 7.72562146e-01],
        [6.22768402e-01, 5.70048451e-01, 9.32394862e-01],
        [1.93144381e-02, 9.84224319e-01, 2.31177509e-02],
        [4.43371236e-02, 4.78261411e-02, 1.75740272e-01]],

       [[8.53002369e-02, 9.80932474e-01, 2.81270206e-01],
        [2.99131423e-01, 1.40638530e-01, 3.35832477e-01],
        [4.45834994e-02, 2.84981728e-02, 4.51387763e-02],
        [2.01447934e-01, 9.64342475e-01, 5.64982355e-01],
        [8.25147331e-01, 5.42085230e-01, 9.80766177e-01],
        [6.86310828e-01, 9.99991894e-01, 9.98379588e-01],
        [1.03266448e-01, 4.05542761e-01, 6.59287333e-01],
        [9.17372108e-01, 9.98853326e-01, 3.90919894e-01]],

       [[9.12598

In [295]:
# Print the layer/parameter table of the current `model`.
model.summary()

Model: "sequential_42"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_184 (Dense)            (8, 3)                    6         
_________________________________________________________________
distribution_lambda_19 (Dist multiple                  0         
Total params: 6
Trainable params: 6
Non-trainable params: 0
_________________________________________________________________


In [314]:
# Symbolic Keras inputs used below to experiment with shape arithmetic.
latent_state = tfk.Input(shape=(17,))
latent_action = tfk.Input(shape=(3,))

In [315]:
# Adds the two dynamic shape tensors element-wise; the recorded result is a
# symbolic KerasTensor of shape (2,) whose values are not statically known.
tf.shape(latent_state) + tf.shape(latent_action)

<KerasTensor: shape=(2,) dtype=int32 inferred_value=[None, None] (created by layer 'tf.__operators__.add_50')>

In [321]:
# Static per-example shape after concatenating the two inputs (recorded: TensorShape([20])).
tfkl.Concatenate()([latent_state, latent_action]).shape[1:]

TensorShape([20])

In [326]:
# TensorFlow Probability version of the running kernel.
tfp.__version__

'0.15.0'