# Bayesian A/B testing with TensorFlow Probability

Source: https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/blob/master/Chapter2_MorePyMC/Ch2_MorePyMC_TFP.ipynb

In case migration is needed from TensorFlow 1.xx to TensorFlow 2: https://www.tensorflow.org/guide/migrate

In [None]:
import tensorflow as tf
import tensorflow_probability as tfp
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode before any `iplot` call.
init_notebook_mode(connected=True)

# Short aliases for the TFP submodules used in the rest of the notebook.
tfd = tfp.distributions
tfb = tfp.bijectors

In [None]:
# Uniform distribution with bounds 0 and 1, named "p".
rv_p = tfd.Uniform(low=0., high=1., name="p")

In [None]:
# Draw one sample from `rv_p` (shown as the cell output).
rv_p.sample()

In [None]:
# Simulation settings: true probability of an occurrence and number of draws.
prob_true = 0.05
n_samples = 1500

# Simulate the observations as `n_samples` draws from Bernoulli(prob_true).
rv_true_occurrences = tfd.Bernoulli(probs=prob_true)
occurrences = rv_true_occurrences.sample(sample_shape=n_samples)

# Sum of the draws divided by their number, to compare with `prob_true`.
observed_frequency = tf.reduce_sum(occurrences) / n_samples

print(f"True frequency (probability): {prob_true}")
print(f"Observed frequency: {observed_frequency}")

Define a log prob for the product of the likelihood and the prior, evaluated at a given value of the probability of an occurrence and on the observed samples of occurrences. In our case we take
$$
\begin{array}{l}
p_a \sim p(p_a) = \text{Uniform}(0, 1) \\
X \sim p(x | p_a) = \text{Ber}(x | p_a)\,.
\end{array}
$$

The data $\mathcal{D} = \left\lbrace x_1, \ldots, x_N \right\rbrace$ is assumed to be i.i.d., so the likelihood is the product of the probability of each value,
$$
p\left( \mathcal{D} | p_a \right) = \prod_{i=1}^{N} p(x_i | p_a) = \prod_{i=1}^{N} \text{Ber}(x_i | p_a)
$$

so the posterior probability distribution $p\left( p_a | \mathcal{D} \right)$ of $p_a$ given the data is
$$
p\left( p_a | \mathcal{D} \right) \propto p\left( \mathcal{D} | p_a \right)\, p(p_a) = \prod_{i=1}^{N} \text{Ber}(x_i | p_a)\, p(p_a),
$$
where the proportionality sign reflects the fact that we are ignoring the evidence term $p\left(\mathcal{D}\right)$, which should appear in the denominator of the RHS and does not depend on $p_a$.

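The MCMC uses the log of the joint probability above, which equals the log of the posterior up to an additive constant ($-\log p\left(\mathcal{D}\right)$, independent of $p_a$),
$$
\log\left( p\left( p_a | \mathcal{D} \right) \right) = \log\left( p(p_a) \right) + \sum_{i=1}^N \log\left( \text{Ber}(x_i | p_a) \right) + \text{const}\,,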
$$

and the logic is, at each step of the chain, to evaluate it on the new proposed values for $p_a$ keeping the occurrences $\left\lbrace x_i \right\rbrace_i$ fixed and use the resulting value to compute the probability of acceptance of the new value for $p_a$.

TFP allows us to define random variables for $p(p_a)$ and $p(x | p_a)$. Each of them exposes a `log_prob()` method, and summing the values these methods return gives the joint log probability above.

In [None]:
def joint_log_prob(occurrences, prob_a):
    """Return the unnormalized log posterior of `prob_a` given `occurrences`.

    The value is the log of prior times likelihood, where the prior on
    `prob_a` is Uniform(0, 1) and the occurrences are scored under a
    Bernoulli distribution with probability `prob_a`.

    Args:
        occurrences: Observed outcomes, scored under the Bernoulli
            likelihood.
        prob_a: Candidate value for the probability of an occurrence.

    Returns:
        The prior log density at `prob_a` plus the sum of the Bernoulli
        log probabilities of all `occurrences`.
    """
    prior = tfd.Uniform(low=0., high=1.)
    likelihood = tfd.Bernoulli(probs=prob_a)

    log_prior = prior.log_prob(prob_a)
    # Under the i.i.d. hypothesis the likelihood is a product over the
    # occurrences, so its log is the sum of the individual log probabilities.
    log_likelihood = tf.reduce_sum(likelihood.log_prob(occurrences))

    return log_prior + log_likelihood

In [None]:
# Sampler settings.
number_of_steps = 48000  # Number of chain results requested from the sampler.
burnin = 25000  # Number of burn-in steps.
leapfrog_steps = 2  # Leapfrog steps per HMC proposal.

# Start the chain at the observed frequency of occurrences. The chain state
# is a list with a single float32 scalar.
init_prob_a = tf.reduce_mean(tf.dtypes.cast(occurrences, dtype=tf.float32))
initial_chain_state = [
    init_prob_a * tf.ones([], dtype=tf.float32, name="init_prob_A"),
]

# One bijector per state part. Identity leaves the state untransformed, so
# the sampler works directly on the value of the probability.
# NOTE(review): a constraining bijector (e.g. a sigmoid) would keep proposals
# inside (0, 1) -- consider whether Identity is really what is wanted here.
unconstraining_bijectors = [tfp.bijectors.Identity()]


def unnormalized_posterior_log_prob(*args):
    """Evaluate `joint_log_prob` with the observed `occurrences` held fixed.

    Closing over `occurrences` means the sampler only varies the remaining
    arguments (the parameters), never the observed data.
    """
    return joint_log_prob(occurrences, *args)

# Initialize the step_size. (It will be automatically adapted.)
# Old TensorFlow 1.xx code.
# with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
#     step_size = tf.get_variable(
#         name='step_size',
#         initializer=tf.constant(0.5, dtype=tf.float32),
#         trainable=False,
#         use_resource=True
#     )

# Older TFP version code.
# hmc = tfp.mcmc.TransformedTransitionKernel(
#     inner_kernel=tfp.mcmc.HamiltonianMonteCarlo(
#         target_log_prob_fn=unnormalized_posterior_log_prob,
#         num_leapfrog_steps=leapfrog_steps,
#         step_size=step_size,
#         step_size_update_fn=tfp.mcmc.make_simple_step_size_update_policy(num_adaptation_steps=int(burnin * 0.8)),
#         state_gradients_are_stopped=True
#     ),
#     bijector=unconstraining_bijectors
# )

# When using bijectors and a step size update policy (now available as
# a kernel instead of a function), the sequence (from inner to outer
# kernel) is:
# mcmc engine (HMC) kernel --> bijectors kernel --> step size adaptation kernel.
hmc_kernel = tfp.mcmc.HamiltonianMonteCarlo(
    target_log_prob_fn=unnormalized_posterior_log_prob,
    num_leapfrog_steps=leapfrog_steps,
    # Initial step size; the outermost kernel below adapts it.
    step_size=tf.constant(0.01, dtype=tf.float32),
)

# Use the `unconstraining_bijectors` list defined with the chain's start
# state (one bijector per state part) instead of a second, hard-coded
# Identity bijector.
transformed_kernel = tfp.mcmc.TransformedTransitionKernel(
    inner_kernel=hmc_kernel,
    bijector=unconstraining_bijectors,
)

# Alias kept only so that any code referring to `realnvp_hmc` keeps working.
realnvp_hmc = transformed_kernel

# Adapt the step size during the first 80% of the burn-in steps.
hmc = tfp.mcmc.SimpleStepSizeAdaptation(
    inner_kernel=transformed_kernel,
    num_adaptation_steps=int(burnin * 0.8),
)

Warnings from the commented version:

```
WARNING:tensorflow:From <ipython-input-132-c435a30efc4b>:44: make_simple_step_size_update_policy (from tensorflow_probability.python.mcmc.hmc) is deprecated and will be removed after 2019-05-22.
Instructions for updating:
Use tfp.mcmc.SimpleStepSizeAdaptation instead.
```

```
/Users/emanuele_moscato/anaconda3/envs/tf-test/lib/python3.7/site-packages/tensorflow_probability/python/mcmc/internal/util.py:494: UserWarning: `step_size` is not a `tf.Tensor`, Python number, or Numpy array. If this parameter is mutable (e.g., a `tf.Variable`), then the behavior implied by `store_parameters_in_results` will silently change on 2019-08-01. Please consult the docstring for `store_parameters_in_results` details and use `store_parameters_in_results=True` to silence this warning.
  param_name))
```

In [None]:
def _trace_kernel_results(_, kernel_results):
    """Select the quantities recorded at every step of the chain.

    The acceptance flag and the log acceptance ratio are read two levels of
    `inner_results` below the results of the kernel passed to the sampler;
    the first level of `inner_results` is recorded as a whole.
    """
    innermost = kernel_results.inner_results.inner_results
    return (
        innermost.is_accepted,
        innermost.log_accept_ratio,
        kernel_results.inner_results,
    )


# Run the chain, then split the trace into its three recorded components.
posterior_prob_A_, kernel_trace = tfp.mcmc.sample_chain(
    num_results=number_of_steps,
    num_burnin_steps=burnin,
    current_state=initial_chain_state,
    kernel=hmc,
    trace_fn=_trace_kernel_results,
)
is_accepted, log_accept_ratio, inner_results = kernel_trace

# posterior_prob_A_, is_accepted, log_accept_ratio

In [None]:
# Mean of the sampled values for the first (and only) part of the chain state.
tf.reduce_mean(posterior_prob_A_[0])

In [None]:
# Histogram of the sampled values, normalised as a probability density.
posterior_samples = posterior_prob_A_[0].numpy()
trace = go.Histogram(x=posterior_samples, histnorm="probability density")

fig = go.Figure(data=[trace])
iplot(fig)