In [None]:
# Execute `notebook_config` in this kernel before any other cell runs.
# NOTE(review): its contents are not visible here — presumably shared plotting/display settings; confirm.
%run notebook_config

# Gaussian processes

`caustic` enables the use of [Gaussian Processes](https://distill.pub/2019/visual-exploration-gaussian-processes/) for modeling correlated noise. The basic idea behind GPs is to extend the covariance matrix of the multivariate gaussian likelihood we've used in the previous tutorial and model the covariance matrix terms by means of a kernel function
which depends on the difference between any two time points at which we measured the flux. That is
   
$$\kappa(t, t')=\kappa \,(|t-t'|)$$

Such a kernel is said to be stationary because it defines a stationary gaussian process. The covariance matrix terms are then

$$
\boldsymbol{\Sigma}_{i, j}=\kappa\left(|t_{i}- t_{j}|\right)+\sigma_{i}^{2} \delta_{i, j}
$$

where $\sigma_i$ are the "error bars" provided by photometry reduction pipelines. Because the covariance matrix is no longer diagonal, and the likelihood function involves computing its inverse and the determinant, naive implementations are extremely costly because the computation of a matrix inverse scales like $\mathcal{O}(N^3)$. Fortunately, the recent
package [celerité](https://celerite.readthedocs.io/en/stable/) enables computation of the gaussian process likelihood in linear time. It does this by restricting the application of GPs to one-dimensional data and a special class of kernel functions which enable efficient computation of the inverse and the determinant of the covariance matrix. The restricted class of kernels is still sufficient for use in microlensing data. 

For more info on celerité, see the package documentation and the associated [paper](https://ui.adsabs.harvard.edu/abs/2017AJ....154..220F/abstract).

In [None]:
import numpy as np
from matplotlib import pyplot as plt

import pymc3 as pm
import theano.tensor as T

import caustic as ca
import exoplanet as xo

# Fix NumPy's global random seed before anything stochastic runs
np.random.seed(seed=42)

# Load the OGLE-2017-BLG-0660 data set from the repository's data directory
event_ogle = ca.data.OGLEData("../../data/OGLE-2017-BLG-0660")

# One wide panel for a first look at the standardized data
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
event_ogle.plot_standardized_data(ax)

Let's first fit a model with a diagonal covariance matrix.

In [None]:
# Initialize a SingleLensModel object for the OGLE data.
# It is used below as the model context (`with model:`) in which all random variables are declared.
model = ca.models.SingleLensModel(event_ogle)

In [None]:
# Definition of the white-noise (diagonal covariance) model.
# Per-band parameters get one entry for every light curve in `event_ogle`.
n_bands = len(event_ogle.light_curves)
# Normal distributions truncated from below at 0 and at 1, respectively
BoundedNormal = pm.Bound(pm.Normal, lower=0.0)
BoundedNormal_1 = pm.Bound(pm.Normal, lower=1.0)

with model:
    # Flux parameters: Delta_F multiplies the magnification and F_base is the
    # additive offset in the mean model defined further down
    Delta_F = BoundedNormal(
        "Delta_F",
        mu=T.zeros(n_bands),
        sd=50.0 * T.ones(n_bands),
        testval=5.0 * T.ones(n_bands),
        shape=(n_bands),
    )

    F_base = pm.Normal(
        "F_base",
        mu=T.zeros(n_bands),
        sd=0.6 * T.ones(n_bands),
        testval=T.zeros(n_bands),
        shape=(n_bands),
    )

    # Other parameters
    # t_0: uniform over [model.t_min, model.t_max], started at the value
    # returned by ca.estimate_t0
    t_0 = pm.Uniform(
        "t_0", model.t_min, model.t_max, testval=ca.estimate_t0(event_ogle)
    )

    # u_0 and t_eff are the sampled quantities (both constrained to be >= 0);
    # t_E is derived from them below rather than sampled directly
    u_0 = BoundedNormal("u_0", mu=0.0, sd=1.5, testval=0.1)

    teff = BoundedNormal("t_eff", mu=0.0, sd=365.0, testval=20.0)

    # Deterministic transformations: registered with pm.Deterministic under the
    # names "t_E", "m_source" and "g"
    t_E = pm.Deterministic("t_E", teff / u_0)
    m_source, g = ca.compute_source_mag_and_blend_fraction(
        event_ogle, model, Delta_F, F_base, u_0
    )
    pm.Deterministic("m_source", m_source)
    pm.Deterministic("g", g)

    # Compute the trajectory of the lens, evaluated at the observed times model.t
    trajectory = ca.trajectory.Trajectory(event_ogle, t_0, u_0, t_E)
    u = trajectory.compute_trajectory(model.t)

    # Compute the magnification
    mag = model.compute_magnification(u, u_0)

    # Compute the mean model
    mean = Delta_F * mag + F_base

    # Let's allow for rescaling of the error bars by a constant factor plus an additive term
    # c_1 is the multiplicative factor; its prior is bounded below at 1
    c_1 = BoundedNormal_1(
        "c_1",
        mu=T.ones(n_bands),
        sd=2.0 * T.ones(n_bands),
        testval=1.5 * T.ones(n_bands),
        shape=(n_bands),
    )

    # c_2 is the additive term; its prior is bounded below at 0
    c_2 = BoundedNormal(
        "c_2",
        mu=T.ones(n_bands),
        sd=1.0 * T.ones(n_bands),
        testval=0.1 * T.ones(n_bands),
        shape=(n_bands),
    )

    # Diagonal terms of the covariance matrix: rescaled reported errors
    # (c_1 * sig_F) and the extra term c_2 added in quadrature
    var_F = (c_1 * model.sig_F) ** 2 + c_2 ** 2

    # Compute the Gaussian log_likelihood of the residuals (data minus mean model),
    # add it as a potential term to the model
    ll = model.compute_log_likelihood(model.F - mean, var_F)
    pm.Potential("log_likelihood", ll)

In [None]:
with model:
    # Sanity check before sampling: log-probability of every basic random
    # variable evaluated at the model's test point
    start_point = model.test_point
    initial_logps = [rv.logp(start_point) for rv in model.basic_RVs]
    print("Initial values of log priors:", initial_logps)

    # NUTS step supplied by `exoplanet`; built inside the model context
    nuts_step = xo.get_dense_nuts_step(target_accept=0.9)

    # 500 tuning iterations followed by 1000 retained draws, on 4 cores
    trace = pm.sample(draws=1000, tune=500, step=nuts_step, cores=4)

In [None]:
# Table of summary statistics for the samples of the white-noise model
pm.summary(trace)

In [None]:
# Marginal posterior plots for the white-noise model; the trailing `;` suppresses the cell's text output
pm.plot_posterior(trace, figsize=(12, 12));

In [None]:
with model:
    # Dense time grid: 2000 evenly spaced points spanning [t_min, t_max],
    # repeated as one identical row per band
    t_grid = np.linspace(model.t_min, model.t_max, num=2000)
    t_dense = np.tile(t_grid, reps=(n_bands, 1))
    t_dense_tensor = T.as_tensor_variable(t_dense)

    # Rebuild the mean model on the dense grid from the model's own variables:
    # trajectory -> magnification -> mean flux
    u_dense = trajectory.compute_trajectory(t_dense_tensor)
    mag_dense = model.compute_magnification(u_dense, u_0)
    mean_dense = Delta_F * mag_dense + F_base

In [None]:
# Two stacked panels sharing the time axis; the upper one gets 3/4 of the height
fig, ax = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=True,
    figsize=(10, 5),
    gridspec_kw={"height_ratios": [3, 1]},
)

# Plot the white-noise model and its residuals on the dense grid (n_samples=50)
ca.plot_model_and_residuals(
    ax,
    event_ogle,
    model,
    trace,
    t_dense_tensor,
    mean_dense,
    n_samples=50,
)

We can see clear correlations in the residuals which aren't accounted for by the model. To expand the model we include a GP.

In [None]:
# Initialize a second SingleLensModel object on the same data for the GP fit,
# separate from the white-noise `model` defined earlier
model_gp = ca.models.SingleLensModel(event_ogle)

In [None]:
# Definition of the mean model and noise rescaling for the GP fit.
# It repeats the white-noise model's parameterisation inside `model_gp`; the
# likelihood is NOT added here (it is added in a later cell together with the GP).
# NOTE(review): this cell rebinds Delta_F, F_base, t_0, u_0, trajectory, mean, var_F, ...
# The white-noise dense-grid cell reads those names, so it must be run before this cell.
n_bands = len(event_ogle.light_curves)
# Normal distributions truncated from below at 0 and at 1, respectively
BoundedNormal = pm.Bound(pm.Normal, lower=0.0)
BoundedNormal_1 = pm.Bound(pm.Normal, lower=1.0)

with model_gp:
    # Flux parameters: Delta_F multiplies the magnification and F_base is the
    # additive offset in the mean model defined further down
    # NOTE(review): Delta_F is declared with shape (n_bands, 1) here, whereas
    # the white-noise model uses (n_bands) and F_base/c_1/c_2 below use
    # (n_bands) — confirm this difference is intended.
    Delta_F = BoundedNormal(
        "Delta_F",
        mu=T.zeros(n_bands),
        sd=50.0 * T.ones(n_bands),
        testval=5.0 * T.ones(n_bands),
        shape=(n_bands, 1),
    )

    F_base = pm.Normal(
        "F_base",
        mu=T.zeros(n_bands),
        sd=0.6 * T.ones(n_bands),
        testval=T.zeros(n_bands),
        shape=(n_bands),
    )

    # Other parameters
    # t_0: uniform over [model_gp.t_min, model_gp.t_max], started at the value
    # returned by ca.estimate_t0
    t_0 = pm.Uniform(
        "t_0", model_gp.t_min, model_gp.t_max, testval=ca.estimate_t0(event_ogle)
    )

    # u_0 and t_eff are the sampled quantities (both constrained to be >= 0);
    # t_E is derived from them below rather than sampled directly
    u_0 = BoundedNormal("u_0", mu=0.0, sd=1.5, testval=0.1)

    teff = BoundedNormal("t_eff", mu=0.0, sd=365.0, testval=20.0)

    # Deterministic transformations: registered with pm.Deterministic under the
    # names "t_E", "m_source" and "g"
    t_E = pm.Deterministic("t_E", teff / u_0)
    m_source, g = ca.compute_source_mag_and_blend_fraction(
        event_ogle, model_gp, Delta_F, F_base, u_0
    )
    pm.Deterministic("m_source", m_source)
    pm.Deterministic("g", g)

    # Compute the trajectory of the lens, evaluated at the observed times model_gp.t
    trajectory = ca.trajectory.Trajectory(event_ogle, t_0, u_0, t_E)
    u = trajectory.compute_trajectory(model_gp.t)

    # Compute the magnification
    mag = model_gp.compute_magnification(u, u_0)

    # Compute the mean model
    mean = Delta_F * mag + F_base

    # Let's allow for rescaling of the error bars by a constant factor plus an additive term
    # c_1 is the multiplicative factor; its prior is bounded below at 1
    c_1 = BoundedNormal_1(
        "c_1",
        mu=T.ones(n_bands),
        sd=2.0 * T.ones(n_bands),
        testval=1.5 * T.ones(n_bands),
        shape=(n_bands),
    )

    # c_2 is the additive term; its prior is bounded below at 0
    c_2 = BoundedNormal(
        "c_2",
        mu=T.ones(n_bands),
        sd=1.0 * T.ones(n_bands),
        testval=0.1 * T.ones(n_bands),
        shape=(n_bands),
    )

    # Diagonal terms of the covariance matrix: rescaled reported errors
    # (c_1 * sig_F) and the extra term c_2 added in quadrature
    var_F = (c_1 * model_gp.sig_F) ** 2 + c_2 ** 2

We'll use the version of `celerite` implemented in the `exoplanet` code because it naturally interfaces with `PyMC3` and provides the gradient of the log likelihood with respect to the GP hyperparameters, which is what we need for HMC to work. For more details check out the [exoplanet docs](https://exoplanet.dfm.io/en/latest/tutorials/gp/). We'll use the `exoplanet.gp.terms.Matern32Term` kernel because it is a sensible default. This kernel is defined by two parameters: the characteristic lengthscale $\rho$ (in our case $\rho$ has dimensions of time since we're dealing with time series data), and $\sigma$, which controls the spread in the dependent variable (the flux). We have to be careful about choosing priors for the lengthscale parameter because GPs are somewhat prone to overfitting. Following the suggestions in this [Stan case study](https://betanalpha.github.io/assets/case_studies/gp_part1/part1.html), we opt to use an Inverse Gamma prior for $\rho$ which assigns 1\% probability to lengthscales less than the median separation between consecutive data points and 1\% probability to lengthscales larger than the entire duration of the time series. This is a sensible prior because it prevents the model from converging to timescales for which there is no justification in the data. To compute the parameters of the Inverse Gamma distribution which satisfy the above requirements, we use the function `ca.compute_invgama_params`.


In [None]:
# Add the GP hyperparameters, one GP per band, and the GP likelihood to `model_gp`.
with model_gp:
    # Initialize the GP parameters for the Matern32 kernel
    # NOTE(review): sigma_gp is declared with shape (n_bands, 1) while rho_gp
    # uses (n_bands), so sigma_gp[n] and rho_gp[n] differ in rank — confirm
    # this is what Matern32Term expects.
    sigma_gp = BoundedNormal(
        "sigma_gp",
        mu=T.zeros(n_bands),
        sd=3.0 * T.ones(n_bands),
        testval=0.5 * T.ones(n_bands),
        shape=(n_bands, 1),
    )

    # NOTE(review): the accompanying text describes an Inverse Gamma prior for
    # rho, but the prior used here is a Normal(0, 100) bounded below at 0 —
    # confirm which one is intended.
    rho_gp = BoundedNormal(
        "rho_gp",
        mu=T.zeros(n_bands),
        sd=100.0 * T.ones(n_bands),
        testval=2.0 * T.ones(n_bands),
        shape=(n_bands),
    )

    # List for storing xo.gp.GP objects
    gp_list = []

    # One kernel and one GP per band, built from that band's times model_gp.t[n]
    # and diagonal terms var_F[n]
    # (J=2 is passed through to xo.gp.GP; its meaning is not shown here — see the exoplanet docs)
    for n in range(n_bands):
        kernel = xo.gp.terms.Matern32Term(sigma=sigma_gp[n], rho=rho_gp[n])
        gp_list.append(xo.gp.GP(kernel, model_gp.t[n], var_F[n], J=2))

    # Compute the Gaussian log_likelihood of the residuals, this time passing the
    # list of GPs as well, and add it as a potential term to the model
    ll = model_gp.compute_log_likelihood(model_gp.F - mean, var_F, gp_list)
    pm.Potential("log_likelihood", ll)

In [None]:
with model_gp:
    # Sanity check before sampling: log-probability of every basic random
    # variable evaluated at the model's test point
    start_point = model_gp.test_point
    initial_logps = [rv.logp(start_point) for rv in model_gp.basic_RVs]
    print("Initial values of log priors:", initial_logps)

    # NUTS step supplied by `exoplanet`; built inside the model context
    nuts_step = xo.get_dense_nuts_step(target_accept=0.9)

    # 500 tuning iterations followed by 2000 retained draws, on 4 cores
    trace_gp = pm.sample(draws=2000, tune=500, step=nuts_step, cores=4)

In [None]:
# Table of summary statistics for the samples of the GP model
pm.summary(trace_gp)

In [None]:
# Trace plots for the GP model; the trailing `;` suppresses the cell's text output
pm.traceplot(trace_gp);

In [None]:
# Marginal posterior plots for the GP model; the trailing `;` suppresses the cell's text output
pm.plot_posterior(trace_gp, figsize=(12, 12));

In [None]:
# Variables of the GP model shown against each other in the pair plot
pairplot_vars = [
    "Delta_F",
    "F_base",
    "c_1",
    "t_0",
    "u_0",
    "t_eff",
    "rho_gp",
    "sigma_gp",
]
pm.pairplot(trace_gp, var_names=pairplot_vars, figsize=(12, 10));

Let's plot the model

In [None]:
with model_gp:
    # Dense time grid: 1000 evenly spaced points spanning [t_min, t_max],
    # repeated as one identical row per band
    t_grid = np.linspace(model_gp.t_min, model_gp.t_max, num=1000)
    t_dense = np.tile(t_grid, reps=(n_bands, 1))
    t_dense_tensor = T.as_tensor_variable(t_dense)

    # Rebuild the mean model on the dense grid from the GP model's own variables:
    # trajectory -> magnification -> mean flux
    u_dense = trajectory.compute_trajectory(t_dense_tensor)
    mag_dense = model_gp.compute_magnification(u_dense, u_0)
    mean_dense = Delta_F * mag_dense + F_base

This will take some time...

In [None]:
# Two stacked panels sharing the time axis; the upper one gets 3/4 of the height
fig, ax = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=True,
    figsize=(10, 8),
    gridspec_kw={"height_ratios": [3, 1]},
)

# Plot the GP model and its residuals on the dense grid (n_samples=50),
# handing over the per-band GP objects as well
ca.plot_model_and_residuals(
    ax, event_ogle, model_gp, trace_gp, t_dense_tensor, mean_dense,
    n_samples=50, gp_list=gp_list,
)

In [None]:
# Overlay the posteriors of both fits in one density plot, with the median as point estimate
compared_traces = [trace, trace_gp]
compared_labels = ["white noise model", "gp model"]
pm.plots.densityplot(
    compared_traces,
    data_labels=compared_labels,
    figsize=(12, 12),
    point_estimate="median",
);

In this case, the GP easily converged and there are no clear patterns in the residuals of the model. We also see substantial differences in the posteriors for the physical parameters: the point estimates are different and the variance of the parameters is generally larger for the GP model.