In [None]:
import random
import math
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import scipy
from scipy.optimize import minimize
from scipy.stats import gamma, norm
import seaborn as sns
import time

from functools import partial

  import pandas.util.testing as tm


In [None]:
N = 10**5  # number of Monte Carlo iterations used by the experiments below
def _r(x):
  """Round ``x`` to two decimal places using ``np.round``."""
  return np.round(x, decimals=2)
def _CI(mu, std, n=1, z=1):
  """Return the array ``[mu - z*std/sqrt(n), mu + z*std/sqrt(n)]``.

  mu:  centre of the interval.
  std: standard deviation of a single observation.
  n:   number of observations (default 1).
  z:   multiplier applied to the standard error (default 1).
  """
  scale = z / np.sqrt(n)
  offsets = np.array([-std, std])
  return mu + scale * offsets

# Importance Sampling

Given an expectation of the form $\alpha = E[h(X)]$, where $X$ is an $R^d$-valued rv with density $f_X(·)$ and $h:R^d \rightarrow R$, we can express $\alpha$ as:
$$\alpha = \int_{R^d} h(x) f_X (x) dx.$$
If $g(·)$ is another pdf on $R^d$ for which
$$g(x) > 0 \ \text{whenever} \ h(x) f_X (x) > 0$$
we can write:
$$\alpha = \int_{R^d} h(x) \frac{f_X (x)}{g(x)} g(x) dx$$
$$\alpha = E_g\left[ h(X) \frac{f_X (X)}{g(X)} \right],$$
i.e. the expectation is now taken with $X$ distributed according to the pdf $g$. \\
Consequently, we can estimate $\alpha$ by generating iid copies of the rv $h(x) \frac{f_X (x)}{g(x)}$, where X now has pdf g. Note that the ratio
$$\frac{f_X (x)}{g(x)}$$
is the relative likelihood of observing outcome $X$ under the nominal pdf $f_X(\cdot)$ to that under the new density $g$. As a result, this rv is often called the **likelihood ratio**. By choosing $g$ so that it samples the more important regions of $R^d$ more frequently than $f_X$ does, we can obtain a variance reduction.

### Application 1: sampling from rare regions of multivariate normal distribution.
Suppose that we wish to compute the probability that a bivariate normal rv with mean vector 0 and covariance matrix C lies in the quadrant with lower lefthand vertex at (3, 3). C is given in the code.
This is a rare event: each coordinate exceeding 3 is already unlikely on its own, and because the correlation is $\rho = -0.7$, it is even less likely that both coordinates are large at the same time.

a) Naive approach: compute a Monte Carlo estimator for the probability based on $n = 1000$, $10^5$, and $10^6$ trials, as well
as the associated approximate 95% confidence intervals.

In [None]:
# Naive Monte Carlo estimate of P(X > 3, Y > 3) for a zero-mean bivariate
# normal with the covariance matrix below.
mu = [0, 0]
cov = [[1, -0.7], [-0.7, 1]]
# The exercise asks for n in [1000, 10**5, 10**6]; a single run with n = N is
# performed here.
n = N
x = np.random.multivariate_normal(mu, cov, n)
# Boolean indicator per draw: True when both coordinates exceed 3.
occurrences = np.all(x > 3, axis=1)
# Vectorized reductions: the built-in sum() would iterate over the array
# element by element in Python.
p = occurrences.mean()
# Sample standard deviation of the indicator (n - 1 in the denominator).
sigma = occurrences.astype(float).std(ddof=1)
# Half-width of the approximate 95% confidence interval.
deviation = 1.96 * sigma / np.sqrt(n)
CI = p + np.array([-deviation, deviation])
print("Size:", n)
print("Probability:", p, "| CI:", CI)

Size: 100000
Probability: 0.0 | CI: [0. 0.]


b)

We choose as $g(x)$ a multivariate normal with $\mu = [3.5,3.5]$ and same covariance matrix as $f(x)$; $g(x)$ will be very likely to yield values that occur in the quadrant of interest and will thus allow us to estimate the probability.

In [None]:
# Mean of the nominal density f (the origin) and of the proposal density g,
# which is shifted to (3.5, 3.5).
mu_f = np.array([0.0, 0.0])
mu_g = np.full(2, 3.5)
# Both densities share this covariance matrix.
cov = [[1, -0.7], [-0.7, 1]]

def mvnormal_pdf(x, mu, cov):
  """Evaluate the *unnormalized* multivariate normal density at ``x``.

  Returns ``exp(-0.5 * (x - mu)^T cov^{-1} (x - mu))``. The normalizing
  constant (which involves the determinant of ``cov``) is deliberately
  omitted, so the result is only meaningful in ratios of densities that
  share the same covariance matrix, where that constant cancels.

  x:   a single point of shape ``(d,)`` or a batch of points of shape
       ``(..., d)``; any array-like is accepted.
  mu:  mean vector of shape ``(d,)`` (array-like).
  cov: covariance matrix of shape ``(d, d)`` (array-like).

  Returns a scalar for a single point, or an array with the leading batch
  shape of ``x`` for a batch.
  """
  centered = np.asarray(x, dtype=float) - np.asarray(mu, dtype=float)
  precision = np.linalg.inv(cov)
  # Quadratic form (x - mu)^T P (x - mu), evaluated independently for every
  # point along the leading dimensions of `centered`.
  quad_form = np.einsum("...i,ij,...j->...", centered, precision, centered)
  return np.exp(-0.5 * quad_form)

# Importance-sampling estimate of P(X > 3, Y > 3) under f, using draws from g.
# The exercise asks for n in [1000, 10**5, 10**6]; a single run with n = N is
# performed here.
n = N
# NOTE(review): x_f is not used by the estimator in this cell; it is kept in
# case other cells rely on it.
x_f = np.random.multivariate_normal(mu_f, cov, n)
x_g = np.random.multivariate_normal(mu_g, cov, n)
# Indicator per draw from g: True when both coordinates exceed 3.
X_g = np.all(x_g > 3, axis=1)
# Unnormalized densities of f and g at every draw from g; mvnormal_pdf is
# called one point at a time.
fx = np.array([mvnormal_pdf(point, mu_f, cov) for point in x_g])
gx = np.array([mvnormal_pdf(point, mu_g, cov) for point in x_g])
# Indicator weighted by the likelihood ratio f(x) / g(x).
X_f = X_g * fx / gx

p = np.mean(X_f)
# Sample standard deviation (n - 1 in the denominator), computed vectorized
# rather than with the built-in sum() over a NumPy array.
sigma = np.std(X_f, ddof=1)
# Half-width of the approximate 95% confidence interval.
deviation = 1.96 * sigma / np.sqrt(n)
CI = p + np.array([-deviation, deviation])
print("Size", n)
print("Probability", p, "| CI:", CI)

Size 100000
Probability 2.064581003302526e-16 | CI: [1.86867657e-16 2.26048544e-16]



Without importance sampling, plain Monte Carlo estimates the probability as zero, because the event is so rare that it is never observed in the sample. Using an appropriately centered importance sampling distribution, we obtain a good estimate of the probability at hand. \\
Let $(X, Y) \sim N(0, C)$. Then good choices for importance sampling distributions have the
property that they are good approximations of the conditional distribution $(X, Y) \mid \{X \ge 3, Y \ge 3\}$. Thus, a good choice for the importance sampling distribution has most of its mass near the lower-left corner $(3, 3)$ of the quadrant $x \ge 3$, $y \ge 3$.