In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

%matplotlib inline
import edward as ed
import matplotlib.pyplot as plt
import numpy as np
import pprint
import scipy
import tensorflow as tf

from edward.models import Bernoulli, Normal, MultivariateNormalTriL

In [2]:
help(MultivariateNormalTriL)

Help on class MultivariateNormalTriL in module abc:

class MultivariateNormalTriL(edward.models.random_variable.RandomVariable, tensorflow.contrib.distributions.python.ops.mvn_tril.MultivariateNormalTriL)
 |  The multivariate normal distribution on `R^k`.
 |  
 |  The Multivariate Normal distribution is defined over `R^k` and parameterized
 |  by a (batch of) length-`k` `loc` vector (aka "mu") and a (batch of) `k x k`
 |  `scale` matrix; `covariance = scale @ scale.T` where `@` denotes
 |  matrix-multiplication.
 |  
 |  #### Mathematical Details
 |  
 |  The probability density function (pdf) is,
 |  
 |  ```none
 |  pdf(x; loc, scale) = exp(-0.5 ||y||**2) / Z,
 |  y = inv(scale) @ (x - loc),
 |  Z = (2 pi)**(0.5 k) |det(scale)|,
 |  ```
 |  
 |  where:
 |  
 |  * `loc` is a vector in `R^k`,
 |  * `scale` is a matrix in `R^{k x k}`, `covariance = scale @ scale.T`,
 |  * `Z` denotes the normalization constant, and,
 |  * `||y||**2` denotes the squared Euclidean norm of `y`.
 |  
 |  A 

In [3]:
class FLAGS:
    N=1000   # Number of data points
    D=5     # Number of features


In [4]:
def build_toy_dataset(N, D, noise_std=1):
    X = np.random.uniform(-6, 6, size=(N, D))
    w = np.random.uniform(-1, 1, size=D)
    b = np.random.uniform(-4, 4)
    epsilon = np.random.normal(0, noise_std, size=N)
    y = (np.dot(X, w) + b + epsilon > 0).astype(int)
    # note this is actually generated from a probit model
    return X, y

In [5]:
ed.set_seed(42)

# DATA
X_train, y_train = build_toy_dataset(FLAGS.N, FLAGS.D)

# MODEL
X = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.D])
w = Normal(loc=tf.zeros(FLAGS.D), scale=tf.ones(FLAGS.D))
b = Normal(loc=tf.zeros([1]), scale=tf.ones([1]))
y = Bernoulli(logits=ed.dot(X, w) + b)

# INFERENCE
qb = Normal(
    loc=tf.Variable(tf.zeros([1])), 
    scale=tf.Variable(tf.ones([1])))  # should probably initialize to random values

w_init = np.random.randn(FLAGS.D)
print(w_init)

qw = MultivariateNormalTriL(
    loc=tf.Variable(tf.cast(w_init, tf.float32)),
    scale_tril=tf.Variable(tf.random_normal([FLAGS.D, FLAGS.D])))

# inference = ed.KLqp({w: qw, b: qb}, data={X: X_train, y: y_train})
inference = ed.Laplace({w: qw, b: qb}, data={X: X_train, y: y_train})
inference.initialize(n_print=10, n_iter=600)

inference.run()


[-0.73977116 -1.33461061 -0.38567263  0.43450265  0.24020584]


  not np.issubdtype(value.dtype, np.float) and \
  not np.issubdtype(value.dtype, np.int) and \


1000/1000 [100%] ██████████████████████████████ Elapsed: 1s | Loss: 152.222


In [6]:
if FLAGS.D == 1:
    n_posterior_samples = 10

    w_post = qw.sample(n_posterior_samples).eval()
    b_post = qb.sample(n_posterior_samples).eval()

    plt.rcParams["figure.figsize"] = (8,6)
    plt.scatter(X_train, y_train)

    inputs = np.linspace(-6, 6, num=400)
    for ns in range(n_posterior_samples):
        output = scipy.special.expit(np.dot(inputs[:,np.newaxis], w_post[ns]) + b_post[ns])
        plt.plot(inputs, output)

    plt.show()

In [7]:
# these give same result
qw.scale.to_dense().eval()
tf.cholesky(qw.covariance()).eval()

array([[ 0.04017381,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.00236594,  0.04017104,  0.        ,  0.        ,  0.        ],
       [-0.00231206, -0.00321833,  0.04667843,  0.        ,  0.        ],
       [ 0.00386958,  0.00074221, -0.00476353,  0.04382423,  0.        ],
       [-0.00027969, -0.00264103,  0.00227064, -0.00508947,  0.04496394]],
      dtype=float32)

In [8]:
# this is inverse of observed Fisher information, used in Laplace approximation
qw.covariance().eval()

array([[ 1.6139353e-03,  9.5049014e-05, -9.2884125e-05,  1.5545593e-04,
        -1.1236388e-05],
       [ 9.5049014e-05,  1.6193104e-03, -1.3475369e-04,  3.8970698e-05,
        -1.0675466e-04],
       [-9.2884125e-05, -1.3475369e-04,  2.1945788e-03, -2.3368954e-04,
         1.1513616e-04],
       [ 1.5545593e-04,  3.8970698e-05, -2.3368954e-04,  1.9587788e-03,
        -2.3690081e-04],
       [-1.1236388e-05, -1.0675466e-04,  1.1513615e-04, -2.3690083e-04,
         2.0598674e-03]], dtype=float32)

In [9]:
qw.mean().eval()

array([-0.29878226, -0.0199684 ,  0.92905945, -0.46812683,  1.0524466 ],
      dtype=float32)

In [10]:
qb.scale.eval()

array([0.02032956], dtype=float32)

In [11]:
qb.loc.eval()

array([-4.5841575], dtype=float32)

In [19]:
# verify Segall formula is the same
w_map = qw.mean().eval()
b_map = qb.loc.eval()
p = y.mean().eval(feed_dict={X: X_train, w: w_map, b: b_map})

In [20]:
W = np.diag(p * (1-p))

In [21]:
# note this is the inverse covariance for weighted least squares! The weights are just Bernoulli variances
hess_segall = np.matmul(np.matmul(X_train.T, W), X_train) / FLAGS.N

In [22]:
print(hess_segall)

[[ 0.61860367 -0.03649266  0.0281739  -0.05102919 -0.00548801]
 [-0.03649266  0.62083827  0.0410662  -0.00284558  0.0339018 ]
 [ 0.0281739   0.0410662   0.46217915  0.04716267 -0.01546445]
 [-0.05102919 -0.00284558  0.04716267  0.52936101  0.05716532]
 [-0.00548801  0.0339018  -0.01546445  0.05716532  0.50282823]]


In [23]:
obs_fisher = np.linalg.inv(qw.covariance().eval()) / FLAGS.N

In [24]:
print(obs_fisher)

[[ 0.6272852  -0.03436808  0.01965453 -0.04734807 -0.00490336]
 [-0.03436808  0.6245095   0.03511814 -0.00187936  0.02999928]
 [ 0.01965453  0.03511814  0.46504438  0.05102199 -0.01819845]
 [-0.04734807 -0.00187936  0.05102199  0.5273516   0.057442  ]
 [-0.00490336  0.02999928 -0.01819845  0.057442    0.49461964]]


In [18]:
# The expression Segall derived for the Fisher information is close enough 
# to the observed Fisher information computed using TF autodiff, available from Laplace approximation.
# What causes the discrepancy?
# Is the Edward Laplace approx really computed at the mode? We evaluate the Segall expression at the MAP estimate...

# Discrepancy increases if the number of samples is one order of magnitude more/less (not sure why)