# Probabilistic deep learning for count data

In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

tfd = tfp.distributions

# Fix the global random seeds so that "Restart Kernel -> Run All" reproduces
# the same data shuffle, weight initialisation and distribution samples.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

sns.set_theme()

## Load data

In [None]:
data_path = '../data/count_data_bayesian_nn/fish.csv'

# Shuffle the rows once, with a fixed seed: the train/test split is positional,
# so an unseeded shuffle would produce a different split on every run.
data = (
    pd.read_csv(data_path)
    .sample(frac=1., random_state=42)
    .reset_index(drop=True)
)

data

In [None]:
n_training_samples = 200

features = ['nofish', 'livebait', 'camper', 'persons', 'child']
target = 'count'

# The rows were shuffled when the data were loaded, so a positional cut
# yields a random train/test partition.
train_rows = data.iloc[:n_training_samples]
test_rows = data.iloc[n_training_samples:]

# Everything is cast to float32, including the integer counts used as targets.
x_train = tf.constant(train_rows[features].values, dtype=tf.float32)
y_train = tf.constant(train_rows[target].values, dtype=tf.float32)

x_test = tf.constant(test_rows[features].values, dtype=tf.float32)
y_test = tf.constant(test_rows[target].values, dtype=tf.float32)

## Define a probabilistic NN outputting Poisson distributions

In [None]:
# Linear layer with a single unit followed by an exponential link, so that
# the network output can be used as a (positive) Poisson rate.
rate_layers = [
    Dense(units=1, input_shape=(5,)),
    Activation('exponential'),
]

# Indexing the last axis drops it: each input row is mapped to one scalar
# Poisson distribution.
poisson_head = tfp.layers.DistributionLambda(
    lambda t: tfd.Poisson(rate=t[..., -1])
)

model = Sequential(rate_layers + [poisson_head])

In [None]:
def nll(y_true, y_pred):
    """
    Negative log-likelihood of the observed data under the
    predicted probability distributions, to be used as
    the loss function for model training.

    `y_pred` must expose a `log_prob` method; the returned value
    is `-y_pred.log_prob(y_true)`, with no reduction applied.
    """
    log_likelihood = y_pred.log_prob(y_true)

    return -log_likelihood

In [None]:
# Plain SGD on the negative log-likelihood. The RMSE metric is tracked
# alongside the loss; since the model outputs distributions, it is evaluated
# on their tensor conversion.
sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
rmse_metric = tf.keras.metrics.RootMeanSquaredError()

model.compile(optimizer=sgd_optimizer, loss=nll, metrics=[rmse_metric])

In [None]:
# Display the layers of the model together with their parameter counts.
model.summary()

In [None]:
# Train silently (verbose=0); the per-epoch loss and metric values are kept
# in `history` and plotted afterwards.
fit_options = dict(epochs=300, verbose=0)

history = model.fit(x_train, y_train, **fit_options)

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(14, 10), sharex=True)

# One panel per tracked quantity: (history key, y-axis label, panel title).
panels = [
    ('loss', 'Loss value', 'Training loss'),
    ('root_mean_squared_error', 'RMSE value', 'Training RMSE'),
]

for ax, (history_key, y_label, panel_title) in zip(axs, panels):
    values = history.history[history_key]

    sns.lineplot(x=range(len(values)), y=values, ax=ax)

    ax.set_ylabel(y_label)
    ax.set_title(panel_title)

# The x axis is shared, so only the bottom panel is labelled.
axs[1].set_xlabel('Epoch')

Calling the model on a batch of datapoints returns a batch of Poisson distributions (one per datapoint).

In [None]:
# Calling the model directly (rather than `predict`) returns the distribution object.
model(x_test)

Because we used the default value for the `convert_to_tensor_fn` option of the `DistributionLambda` layer, the model's `predict` method (which needs to generate tensors from distributions) corresponds to sampling the distributions.

In [None]:
# `predict` has to return tensors: with the default `convert_to_tensor_fn`
# the output is a sample drawn from the predicted distributions.
model.predict(x_test)

RMSE on training and test data. Because the model outputs distributions that we can sample, we can generate a distribution of RMSE (each sample is a prediction for all components of the `x_test` input).

__Question:__ why is the test RMSE (distribution) smaller than the training one?

In [None]:
# Each of the 10000 draws is a full set of predictions (one per datapoint);
# averaging the squared errors over the last axis gives one RMSE per draw.
training_draws = model(x_train).sample(10000)
training_rmse_distr = tf.sqrt(
    tf.reduce_mean(tf.square(training_draws - y_train), axis=-1)
)

test_draws = model(x_test).sample(10000)
test_rmse_distr = tf.sqrt(
    tf.reduce_mean(tf.square(test_draws - y_test), axis=-1)
)

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))

# Test is drawn first (first palette colour), training second.
rmse_series = [
    ('Test', test_rmse_distr),
    ('Training', training_rmse_distr),
]

for color, (label, rmse_values) in zip(sns.color_palette(), rmse_series):
    sns.histplot(
        x=rmse_values.numpy(),
        stat='density',
        label=label,
        color=color,
        ax=ax
    )

ax.set_xlabel('RMSE')
ax.set_title('Distribution of RMSE')
ax.legend()

Training and test loss (NLL) value.

In [None]:
# Mean negative log-likelihood per datapoint on each split.
loss_splits = {
    'Training loss:': (x_train, y_train),
    'Test loss:': (x_test, y_test),
}

for caption, (x_split, y_split) in loss_splits.items():
    print(caption, nll(y_split, model(x_split)).numpy().mean())

Plot the true vs. the predicted values, together with the 2.5 and 97.5 percentiles of the predictive distributions, estimated numerically by sampling. These two percentiles bound the central 95% credibility interval around each prediction.

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))

pred_distr_mean = model(x_test).mean()

# Work with NumPy arrays from here on: seaborn/matplotlib expect array-likes,
# not TensorFlow tensors.
pred_mean_values = pred_distr_mean.numpy()

sns.scatterplot(
    x=pred_mean_values,
    y=y_test.numpy(),
    color=sns.color_palette()[0],
    label='Data',
    ax=ax
)

# Diagonal on which a perfect prediction would fall.
x_plot = np.linspace(pred_mean_values.min(), pred_mean_values.max(), 100)

sns.lineplot(
    x=x_plot,
    y=x_plot,
    color=sns.color_palette()[1],
    label='actual = pred line',
    ax=ax
)

# Percentiles.
# `np.unique` already returns its values sorted in ascending order, so no
# further sorting or re-indexing is needed.
unique_pred_distr_mean = np.unique(pred_mean_values)

poisson_distr = tfd.Poisson(rate=unique_pred_distr_mean)

# Both bounds are estimated from the same set of draws: passing the two
# percentiles at once returns an array whose first axis indexes them.
lower_percentiles, higher_percentiles = tfp.stats.percentile(
    poisson_distr.sample(10000), [2.5, 97.5], axis=0
).numpy()

for percentile_values in (lower_percentiles, higher_percentiles):
    sns.lineplot(
        x=unique_pred_distr_mean,
        y=percentile_values,
        linestyle='dashed',
        color=sns.color_palette()[1],
        ax=ax
    )

ax.set_xlabel('Predictions (distribution mean)')
ax.set_ylabel('Actual data')
ax.legend()

## Define a probabilistic NN outputting zero-inflated Poisson distributions

The zero-inflated Poisson distribution is useful when there are many more zeros than expected from a regular Poisson distribution. Its definition is:
- The value is 0 with probability $p$.
- The value is sampled from a Poisson distribution with rate $\lambda$ with probability $1 - p$.

In [None]:
def build_zero_inflated_poisson(input_tensor):
    """
    Given an input tensor, returns a mixture distribution made
    of a deterministic and a Poisson one.

    Args:
        input_tensor: Tensor whose last axis holds the two unconstrained
            parameters of each distribution: index 0 is passed through
            a sigmoid to give the probability of the "always zero"
            component, index 1 is exponentiated to give the Poisson
            rate. All the leading axes are batch axes.

    Returns:
        A `tfd.Mixture` of `tfd.Deterministic(0)` and `tfd.Poisson`,
        with one distribution per entry of the leading (batch) axes.

    Note: the sigmoid and the exponential functions are applied
          so that the probabilities are in [0, 1] and the rates
          are non-negative.
    """
    # Build a tensor of probabilities for the mixture
    # from the input tensor. The slice `:1` keeps the last axis so that
    # the two probabilities can be concatenated along it.
    probs = tf.math.sigmoid(input_tensor[..., :1])
    probs = tf.concat([probs, 1. - probs], axis=-1)

    # Build a tensor of rates for the Poisson distribution
    # in the mixture. Indexing removes the last axis only: an unqualified
    # `tf.squeeze` would also remove a batch axis of size 1, making
    # the shape of the rates disagree with that of the probabilities.
    rates = tf.math.exp(input_tensor[..., 1])

    return tfd.Mixture(
        cat=tfd.Categorical(probs=probs),
        components=[
            tfd.Deterministic(loc=tf.zeros_like(rates)),
            tfd.Poisson(rate=rates)
        ]
    )

In [None]:
# Quick check on random parameters: 10 rows with 2 values each.
test_zip_params = tf.random.uniform((10, 2))
test_zip_distr = build_zero_inflated_poisson(test_zip_params)

test_zip_distr

In [None]:
# Draw 11 samples from each of the distributions built from the test parameters.
build_zero_inflated_poisson(test_zip_params).sample(11)

Build model.

In [None]:
# Two linear outputs per datapoint, turned into the parameters of
# a zero-inflated Poisson distribution by `build_zero_inflated_poisson`.
zip_params_layer = Dense(units=2)
zip_head = tfp.layers.DistributionLambda(build_zero_inflated_poisson)

zip_model = Sequential([zip_params_layer, zip_head])

In [None]:
# Same training setup as for the Poisson model: SGD on the negative
# log-likelihood, with the RMSE tracked as a metric.
zip_sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
zip_rmse_metric = tf.keras.metrics.RootMeanSquaredError()

zip_model.compile(
    optimizer=zip_sgd_optimizer,
    loss=nll,
    metrics=[zip_rmse_metric]
)

In [None]:
# The Dense layer of this model was declared without an input shape, so
# the weights are only created when the model is first called on data.
zip_model(x_train)

In [None]:
# Display the layers of the model together with their parameter counts.
zip_model.summary()

In [None]:
# Train silently (verbose=0); the per-epoch loss and metric values are kept
# in `zip_history` and plotted afterwards.
zip_fit_options = dict(epochs=500, verbose=0)

zip_history = zip_model.fit(x_train, y_train, **zip_fit_options)

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(14, 10), sharex=True)

# One panel per tracked quantity: (history key, y-axis label, panel title).
zip_panels = [
    ('loss', 'Loss value', 'Training loss'),
    ('root_mean_squared_error', 'RMSE value', 'Training RMSE'),
]

for ax, (history_key, y_label, panel_title) in zip(axs, zip_panels):
    values = zip_history.history[history_key]

    sns.lineplot(x=range(len(values)), y=values, ax=ax)

    ax.set_ylabel(y_label)
    ax.set_title(panel_title)

# The x axis is shared, so only the bottom panel is labelled.
axs[1].set_xlabel('Epoch')

Model evaluation.

In [None]:
# Each of the 10000 draws is a full set of predictions (one per datapoint);
# averaging the squared errors over the last axis gives one RMSE per draw.
zip_training_draws = zip_model(x_train).sample(10000)
zip_training_rmse_distr = tf.sqrt(
    tf.reduce_mean(tf.square(zip_training_draws - y_train), axis=-1)
)

zip_test_draws = zip_model(x_test).sample(10000)
zip_test_rmse_distr = tf.sqrt(
    tf.reduce_mean(tf.square(zip_test_draws - y_test), axis=-1)
)

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))

# Test is drawn first (first palette colour), training second.
zip_rmse_series = [
    ('Test', zip_test_rmse_distr),
    ('Training', zip_training_rmse_distr),
]

for color, (label, rmse_values) in zip(sns.color_palette(), zip_rmse_series):
    sns.histplot(
        x=rmse_values.numpy(),
        stat='density',
        label=label,
        color=color,
        ax=ax
    )

ax.set_xlabel('RMSE')
ax.set_title('Distribution of RMSE')
ax.legend()

In [None]:
# Mean negative log-likelihood per datapoint on each split.
zip_loss_splits = {
    'Training loss:': (x_train, y_train),
    'Test loss:': (x_test, y_test),
}

for caption, (x_split, y_split) in zip_loss_splits.items():
    print(caption, nll(y_split, zip_model(x_split)).numpy().mean())

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))

# Keep the predicted distributions: both their means and their percentiles
# are needed below.
pred_distr = zip_model(x_test)
pred_distr_mean = pred_distr.mean()

# Work with NumPy arrays from here on: seaborn/matplotlib expect array-likes,
# not TensorFlow tensors.
pred_mean_values = pred_distr_mean.numpy()

sns.scatterplot(
    x=pred_mean_values,
    y=y_test.numpy(),
    color=sns.color_palette()[0],
    label='Data',
    ax=ax
)

# Diagonal on which a perfect prediction would fall.
x_plot = np.linspace(pred_mean_values.min(), pred_mean_values.max(), 100)

sns.lineplot(
    x=x_plot,
    y=x_plot,
    color=sns.color_palette()[1],
    label='actual = pred line',
    ax=ax
)

# Percentiles.
# They are estimated from samples of the zero-inflated distributions
# predicted by the model. A plain Poisson distribution with the same mean
# has a different shape (no extra mass at zero), so it would give the wrong
# interval for this model.
lower_percentiles, higher_percentiles = tfp.stats.percentile(
    pred_distr.sample(10000), [2.5, 97.5], axis=0
).numpy()

# Keep one point per distinct predicted mean, in ascending order, so that
# the interval bounds can be drawn as lines.
unique_pred_distr_mean, first_occurrence = np.unique(
    pred_mean_values, return_index=True
)
lower_percentiles = lower_percentiles[first_occurrence]
higher_percentiles = higher_percentiles[first_occurrence]

for percentile_values in (lower_percentiles, higher_percentiles):
    sns.lineplot(
        x=unique_pred_distr_mean,
        y=percentile_values,
        linestyle='dashed',
        color=sns.color_palette()[1],
        ax=ax
    )

ax.set_xlabel('Predictions (distribution mean)')
ax.set_ylabel('Actual data')
ax.legend()