# Heteroskedastic linear regression with neural networks and probabilistic layers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD, RMSprop
from tensorflow.keras.losses import mse
from sklearn.linear_model import LinearRegression

# Short aliases for the TensorFlow Probability submodules used throughout.
tfd = tfp.distributions  # probability distributions
tfpl = tfp.layers  # probabilistic Keras layers

## Generate synthetic data

Generate synthetic data that is normally distributed around a straight line. We'll make this more interesting by using a joint distribution where everything is randomly sampled at the same time: the parameters of the distribution of the data (slope, intercept and the variance around the line) and the data itself - from one single joint distribution.

In [None]:
# Number of (x, y) pairs in the synthetic dataset.
n_points = 180

# Draw the abscissas uniformly and store them as a column of shape
# (n_points, 1).
x_data = tfd.Uniform(low=-10.5, high=23.).sample(n_points)
x_data = tf.reshape(x_data, (n_points, 1))

joint_distr = tfd.JointDistributionSequential([
    tfd.Normal(loc=2.5, scale=3.),  # m (slope)
    tfd.Uniform(low=-5., high=12.),  # q (intercept)
    tfd.TransformedDistribution(
        tfd.HalfNormal(scale=.5),
        tfp.bijectors.Shift(shift=10.)),  # sigma (noise scale factor)
    # Note 1: values in the sequence have to be passed to the lambda function
    # in the reverse order w.r.t. the one in which they appear.
    # Note 2: x_data has shape (n_points, 1), so the Normal below has
    # batch_shape (n_points, 1). Both dimensions are reinterpreted as event
    # dimensions so that the event is the entire dataset
    # (event_shape == (n_points, 1)); reinterpreting only one dimension would
    # leave batch_shape == (n_points,) and a single point per event.
    # The noise scale grows with |x|, which makes the data heteroskedastic;
    # the 0.1 offset keeps the scale strictly positive at x == 0.
    lambda sigma, q, m: tfd.Independent(
        tfd.Normal(loc=x_data * m + q, scale=tf.abs(x_data) * sigma + 0.1),
        reinterpreted_batch_ndims=2)
])

# Sample the joint distribution: this returns the component distributions
# together with one draw from each of them.
distr, samples = joint_distr.sample_distributions()

m_sampled, q_sampled, sigma_sampled, y_data = samples

# Make the column shape of the targets explicit (same shape as x_data).
y_data = tf.reshape(y_data, (n_points, 1))


# Scatter plot of the sampled dataset.
fig = plt.figure(figsize=(14, 6))

sns.set_theme()

plt.scatter(x_data, y_data)

## Fit a linear regression to the data using a neural network (with one linear layer)

In [None]:
# A single Dense unit fed with one feature: one weight (slope) and one bias
# (intercept), i.e. a straight line.
model = Sequential()
model.add(Dense(units=1, input_shape=(1,)))

In [None]:
# Total number of parameters in the model; for one Dense unit with one input
# this should be 2 (one weight and one bias).
model.count_params()

In [None]:
# Data together with the line produced by the still-untrained model.
fig = plt.figure(figsize=(14, 6))

sns.set_theme()

plt.scatter(x_data, y_data, alpha=0.5, label='Data')

# Evenly spaced abscissas spanning the range of the data.
x_plot = tf.linspace(tf.reduce_min(x_data), tf.reduce_max(x_data), 100)

# The model expects a column of inputs, hence the reshape to (100, 1).
plt.plot(
    x_plot.numpy(),
    model(tf.reshape(x_plot, (-1, 1))).numpy(),
    color='r',
    label='Fit (before training!)')

plt.legend()

In [None]:
# Train by gradient descent on the mean squared error.
model.compile(loss=mse, optimizer=SGD(learning_rate=0.0005))

In [None]:
# Fit silently (verbose=0); the per-epoch loss is kept in `history`.
history = model.fit(x_data, y_data, epochs=1000, verbose=0)

In [None]:
# Loss recorded by `fit` at the end of each epoch.
fig = plt.figure(figsize=(14, 6))

sns.set_theme()

plt.plot(
    range(len(history.history['loss'])),
    history.history['loss'],
    # This curve is the training loss, not a fit of the data.
    label='Training loss',
    color='r'
)

plt.legend()
plt.title('Loss function history along epochs', fontsize=14)
plt.xlabel('Epoch')
plt.ylabel('Loss (mean square error)')

In [None]:
# Data together with the line produced by the trained model.
fig = plt.figure(figsize=(14, 6))

sns.set_theme()

plt.scatter(x_data, y_data, alpha=0.5, label='Data')

# Evenly spaced abscissas spanning the range of the data.
x_plot = tf.linspace(tf.reduce_min(x_data), tf.reduce_max(x_data), 100)

# The model expects a column of inputs, hence the reshape to (100, 1).
plt.plot(
    x_plot.numpy(),
    model(tf.reshape(x_plot, (-1, 1))).numpy(),
    color='r',
    label='Fit (after training)')

plt.legend()

Compare with a linear regression from Scikit-learn.

In [None]:
# Ordinary least squares fit, used as a reference for the network's weights.
lr = LinearRegression(fit_intercept=True)

# scikit-learn works on NumPy arrays: convert both tensors explicitly rather
# than relying on an implicit conversion of the TensorFlow targets.
lr.fit(
    x_data.numpy().reshape(-1, 1),
    y_data.numpy())

In [None]:
# Weights of the only layer of the network: its kernel (slope) and bias
# (intercept).
model.get_layer(index=0).weights

In [None]:
# Slope and intercept found by scikit-learn, to be compared with the weights
# of the neural network.
lr.coef_, lr.intercept_

In [None]:
# Overlay the neural-network fit and the scikit-learn fit on the data.
fig = plt.figure(figsize=(14, 6))

sns.set_theme()

plt.scatter(x_data, y_data, alpha=0.5, label='Data')

# Evenly spaced abscissas spanning the range of the data.
x_plot = tf.linspace(tf.reduce_min(x_data), tf.reduce_max(x_data), 100)

# Both estimators expect a column of inputs, hence the reshapes to (100, 1).
plt.plot(
    x_plot.numpy(),
    model(tf.reshape(x_plot, (-1, 1))).numpy(),
    color='r',
    label='Fit (neural network)')

plt.plot(
    x_plot.numpy(),
    lr.predict(tf.reshape(x_plot, (-1, 1)).numpy()),
    color='green',
    label='Fit (linear regression)')

plt.legend()

## Fit with a neural network with a probabilistic layer at the end

Simple fit of sigma for the output Normal distribution.

In [None]:
def _normal_from_linear_output(t):
    """Build the output distribution from the two values of the Dense layer.

    The first value is used as the mean; the second one goes through a
    softplus so that the resulting scale is positive.
    """
    loc = t[..., :1]
    scale = tf.math.softplus(t[..., 1:])
    return tfd.Independent(
        tfd.Normal(loc=loc, scale=scale),
        reinterpreted_batch_ndims=1)


# Linear layer with two outputs followed by a layer returning a distribution;
# when a tensor is needed, the distribution is sampled.
probabilistic_model = Sequential([
    Dense(units=2, input_shape=(1, )),
    tfpl.DistributionLambda(
        make_distribution_fn=_normal_from_linear_output,
        convert_to_tensor_fn=tfd.Distribution.sample),
])

In [None]:
# Calling the model directly returns its last layer's output: a batch of
# distributions (one for each input datapoint), not a plain tensor.
probabilistic_model(x_data)

In [None]:
# `predict` returns the tensor produced by the chosen `convert_to_tensor_fn`
# (in this case a sample from the output batch of distributions), so the
# values change from call to call. Only the first 10 rows are shown.
probabilistic_model.predict(x_data)[:10]

In [None]:
def nll(y_true, y_pred):
    """Return the negative log-likelihood of `y_true` under `y_pred`.

    `y_pred` is any object exposing a `log_prob` method (here the
    distribution returned by the model); `y_true` is passed to it unchanged.
    """
    log_likelihood = y_pred.log_prob(y_true)
    return -log_likelihood

In [None]:
# Make sure everything works as expected on the first two datapoints.
# x_data and y_data are already column tensors, so slicing is enough.
x_test = x_data[:2]

# Target values.
y_test = y_data[:2]

# Output of the model evaluated on input data (a batch of
# distributions).
y_pred = probabilistic_model(x_test)

# Negative log-likelihood of each target under its predicted distribution.
nll(y_test, y_pred)

In [None]:
# Minimise the negative log-likelihood of the targets under the predicted
# distributions.
probabilistic_model.compile(loss=nll, optimizer=RMSprop(learning_rate=0.005))

# Fit silently (verbose=0); the per-epoch loss is kept in `history`.
history = probabilistic_model.fit(x_data, y_data, epochs=500, verbose=0)

In [None]:
# Loss recorded by `fit` at the end of each epoch.
fig = plt.figure(figsize=(14, 6))

sns.set_theme()

plt.plot(
    range(len(history.history['loss'])),
    history.history['loss'],
    # This curve is the training loss, not a fit of the data.
    label='Training loss',
    color='r'
)

plt.legend()
plt.title('Loss function history along epochs', fontsize=14)
plt.xlabel('Epoch')
# The probabilistic model is trained on the negative log-likelihood, not on
# the mean squared error.
plt.ylabel('Loss (negative log-likelihood)')

In [None]:
# Compare the deterministic fit with the mean of the probabilistic model and
# with synthetic data sampled from the latter.
fig = plt.figure(figsize=(14, 6))

sns.set_theme()

plt.scatter(x_data, y_data, alpha=0.5, label='Data')

# Evenly spaced abscissas spanning the range of the data, as a column.
x_plot = tf.reshape(
    tf.linspace(tf.reduce_min(x_data), tf.reduce_max(x_data), 100),
    (-1, 1))

plt.plot(
    x_plot.numpy(),
    model(x_plot).numpy(),
    color='r',
    label='Fit (neural network)')

# Mean of the predicted distribution at each abscissa.
plt.plot(
    x_plot.numpy(),
    probabilistic_model(x_plot).mean(),
    color='green',
    label='Fit (probabilistic neural network)')

# One draw from the predicted distribution at each original datapoint.
plt.scatter(
    x_data.numpy(),
    probabilistic_model(x_data).sample(),
    color='green',
    alpha=0.6,
    label='Synthetic samples from probabilistic neural network')

plt.legend()

More complicated fit of sigma.

**Note:** if we don't introduce any nonlinear activation function, the mean of the output batch of TFP distributions will unavoidably be linear in $x$, and the standard deviation will just be the softplus of a linear function of $x$ (hence monotonic). In order to have a more complicated model of heteroskedasticity we need to introduce some nonlinearity. Moreover, not all the nonlinear activation functions work equally well: without altering the structure of the network, `relu` gives a good fit, while `tanh` and `sigmoid` don't. Notice however that in this case the data was synthetic and we knew in advance what we were looking for (standard deviations shrinking and then increasing again as $x$ increases), so there's a bias in what we deem "good".

In [None]:
def _normal_from_hidden_output(t):
    """Build the output distribution from the two values of the last Dense.

    The first value is used as the mean; the second one goes through a
    softplus so that the resulting scale is positive.
    """
    loc = t[..., :1]
    scale = tf.math.softplus(t[..., 1:])
    return tfd.Independent(
        tfd.Normal(loc=loc, scale=scale),
        reinterpreted_batch_ndims=1)


# Define a more complicated model: a small hidden layer with a relu
# activation feeds the two parameters of the output distribution.
probabilistic_model_sigma = Sequential([
    Dense(units=4, activation='relu', input_shape=(1, )),
    Dense(2),
    tfpl.DistributionLambda(make_distribution_fn=_normal_from_hidden_output),
])

# Minimise the negative log-likelihood of the targets under the predicted
# distributions.
probabilistic_model_sigma.compile(
    loss=nll,
    optimizer=RMSprop(learning_rate=0.005))

# Train model silently (verbose=0); the per-epoch loss is kept in `history`.
history = probabilistic_model_sigma.fit(
    x_data, y_data, epochs=500, verbose=0)

# Plot loss function along the epochs.
fig = plt.figure(figsize=(14, 6))

sns.set_theme()

plt.plot(
    range(len(history.history['loss'])),
    history.history['loss'],
    # This curve is the training loss, not a fit of the data.
    label='Training loss',
    color='r'
)

plt.legend()
plt.title('Loss function history along epochs', fontsize=14)
plt.xlabel('Epoch')
# The probabilistic model is trained on the negative log-likelihood, not on
# the mean squared error.
plt.ylabel('Loss (negative log-likelihood)')

# Plot fit and samples from the output layer.
fig = plt.figure(figsize=(14, 6))

sns.set_theme()

plt.scatter(
    x=x_data,
    y=y_data,
    alpha=0.5,
    label='Data')

# Evenly spaced abscissas spanning the range of the data, as a column.
x_plot = tf.linspace(
    tf.reduce_min(x_data),
    tf.reduce_max(x_data),
    100)
x_plot = tf.reshape(x_plot, (x_plot.shape[0], 1))

plt.plot(
    x_plot.numpy(),
    model(x_plot).numpy(),
    label='Fit (neural network)',
    color='r'
)

# Evaluate the probabilistic model once and reuse the resulting batch of
# distributions for both the mean and the scale.
plot_distr = probabilistic_model_sigma(x_plot)
y_plot = plot_distr.mean()
# Scale of the Normal wrapped by the Independent distribution.
sigma_plot = plot_distr.parameters['distribution'].scale
y_high = y_plot + 2. * sigma_plot
y_low = y_plot - 2. * sigma_plot

# Band of two standard deviations around the predicted mean.
plt.fill_between(
    x=x_plot.numpy().flatten(),
    y1=y_low.numpy().flatten(),
    y2=y_high.numpy().flatten(),
    color='green',
    alpha=0.2,
    # Raw string: `\p` and `\s` are not valid Python escape sequences.
    label=r'mean $\pm 2\sigma$'
)

plt.plot(
    x_plot.numpy(),
    y_plot,
    label='Fit (probabilistic neural network)',
    color='green',
)

# One draw from the predicted distribution at each original datapoint.
plt.scatter(
    x_data.numpy(),
    probabilistic_model_sigma(x_data).sample(),
    label='Synthetic samples from probabilistic neural network',
    color='green',
    alpha=0.6
)

plt.legend()