In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path
from modules._import_helper import safe_import_from

# Load ImportanceSampler from the project's variance-reduction module
ImportanceSampler = safe_import_from(
    '05_simulation_monte_carlo.src.variance_reduction',
    'ImportanceSampler',
)

# Reproducibility and output directory for saved figures
np.random.seed(42)
reports_dir = Path('../reports')
reports_dir.mkdir(exist_ok=True)

# Plot defaults applied to the figures created below
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 5),
    'font.size': 11,
})

## 1. Motivation: When Naive MC Struggles

**Problem:** Estimate $I = E_p[h(X)]$ where $h(X)$ is very small most of the time.

**Example:** Tail probability $P(X > 3)$ for $X \sim \mathcal{N}(0,1)$:
- True value: $P(X > 3) \approx 0.00135$
- Naive MC: Sample $X_1, ..., X_N \sim \mathcal{N}(0,1)$, estimate $\hat{p} = \frac{1}{N}\sum_{i=1}^N \mathbb{1}_{X_i > 3}$

**Problem:** With $N=10000$, we expect only ~13 hits. High variance!

$$\text{Var}[\hat{p}] = \frac{p(1-p)}{N} \approx \frac{0.00135}{N}$$

Standard error: $SE \approx 0.000367$ → **27% relative error**!

In [None]:
# Demonstration: Naive MC for tail probability
def naive_tail_estimate(threshold, n_samples, seed=42):
    """Plain Monte Carlo estimate of P(X > threshold) for X ~ N(0,1).

    Reseeds NumPy's global RNG with ``seed``, draws ``n_samples`` standard
    normal variates and counts how many exceed ``threshold``.

    Returns:
        (p_hat, se, n_hits): the fraction of draws above the threshold, its
        Bernoulli standard error sqrt(p_hat * (1 - p_hat) / n_samples)
        (reported as 0 when there are no hits), and the number of hits.
    """
    np.random.seed(seed)
    exceeds = np.random.randn(n_samples) > threshold
    p_hat = exceeds.mean()

    # Bernoulli standard error; with zero hits the plug-in formula is 0 anyway
    if p_hat > 0:
        se = np.sqrt(p_hat * (1 - p_hat) / n_samples)
    else:
        se = 0

    return p_hat, se, exceeds.sum()

threshold = 3.0
# Survival function instead of 1 - cdf: for tail probabilities the subtraction
# cancels most significant digits (and returns 0 for large thresholds),
# whereas sf() evaluates the upper tail directly.
true_prob = stats.norm.sf(threshold)
n = 10000

# Naive estimate, its standard error and the raw number of tail hits
p_hat, se, n_hits = naive_tail_estimate(threshold, n)

print(f"True probability: {true_prob:.6f}")
print(f"Naive MC estimate: {p_hat:.6f} ± {se:.6f}")
print(f"Relative SE: {(se/true_prob)*100:.1f}%")
print(f"Number of hits: {n_hits}/{n}")
print(f"\n⚠ Problem: Very few samples in tail → high variance")

## 2. Importance Sampling: The Core Idea

**Key insight:** Sample from a **proposal distribution** $q(x)$ that concentrates mass where $h(x)$ is large!

**Derivation:**
$$I = \int h(x) p(x) dx = \int h(x) \frac{p(x)}{q(x)} q(x) dx = E_q\left[h(X) \cdot \frac{p(X)}{q(X)}\right]$$

**IS estimator:**
$$\hat{I}_{IS} = \frac{1}{N}\sum_{i=1}^N h(X_i) \cdot w(X_i), \quad X_i \sim q, \quad w(x) = \frac{p(x)}{q(x)}$$

**Why it works:**
- $\hat{I}_{IS}$ is **unbiased**: $E_q[\hat{I}_{IS}] = I$ (provided $q(x) > 0$ wherever $h(x)p(x) \neq 0$)
- Variance can be **much smaller** if $q$ concentrates on important regions
- Optimal $q^*(x) \propto |h(x)|p(x)$ (zero variance if $h \geq 0$!)

**For tail probability:** Use $q = \mathcal{N}(\mu, 1)$ with $\mu$ near the threshold!

## 3. Implementing Importance Sampling

Let's estimate the same tail probability using IS:

In [None]:
# Setup: P(X > 3) for X ~ N(0, 1)
threshold = 3.0
n_samples = 10000

def target_func(x):
    """Indicator of the tail event: 1.0 where x > threshold, else 0.0."""
    in_tail = x > threshold
    return in_tail.astype(float)

def proposal_sampler(n):
    """Draw n samples from the shifted proposal N(threshold, 1)."""
    return threshold + np.random.randn(n)

def log_weight_func(x):
    """Log importance weight log p(x) - log q(x), kept in log space for stability.

    p is the target N(0, 1) and q is the proposal N(threshold, 1).
    """
    return stats.norm.logpdf(x, 0, 1) - stats.norm.logpdf(x, threshold, 1)

# Build the sampler from the shifted proposal and its log-weight function,
# then estimate the tail probability with the same budget as the naive run
sampler = ImportanceSampler(proposal_sampler, log_weight_func, seed=42)
result = sampler.estimate(target_func, n_samples)

# Side-by-side report against the naive estimate computed earlier
print(f"True probability: {true_prob:.6f}")
print(f"\nNaive MC: {p_hat:.6f} ± {se:.6f} (relative SE: {(se/true_prob)*100:.1f}%)")
print(f"Importance Sampling: {result.estimate:.6f} ± {result.std_error:.6f} (relative SE: {(result.std_error/true_prob)*100:.1f}%)")
print(f"\nVariance Reduction Factor: {result.variance_reduction_factor:.1f}x")
print(f"✓ IS is {result.variance_reduction_factor:.0f}x more efficient!")

**Observation:** IS gives ~20-50x variance reduction → equivalent to using 200,000-500,000 naive samples!

## 4. Visualizing the Difference

Let's see where naive MC vs IS concentrate their samples:

In [None]:
# Generate samples from both methods
np.random.seed(42)
n_vis = 5000

# Naive: sample from N(0,1)
samples_naive = np.random.randn(n_vis)

# IS: sample from N(3,1)
samples_is = np.random.randn(n_vis) + threshold

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left: Histograms of the two sample sets with the densities they came from
x = np.linspace(-3, 7, 200)
ax1.hist(samples_naive, bins=50, alpha=0.5, density=True, label='Naive MC samples', color='blue')
ax1.hist(samples_is, bins=50, alpha=0.5, density=True, label='IS samples', color='orange')
ax1.plot(x, stats.norm.pdf(x, 0, 1), 'b-', linewidth=2, label='Target p(x) = N(0,1)')
ax1.plot(x, stats.norm.pdf(x, threshold, 1), 'orange', linewidth=2, label='Proposal q(x) = N(3,1)')
ax1.axvline(threshold, color='red', linestyle='--', linewidth=2, label=f'Threshold = {threshold}')
# Shade the part of the target density that lies in the tail event
ax1.fill_between(x[x>threshold], 0, stats.norm.pdf(x[x>threshold], 0, 1), 
                  alpha=0.3, color='red', label='Target region')
ax1.set_xlabel('x')
ax1.set_ylabel('Density')
ax1.set_title('Naive MC vs Importance Sampling')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right: Convergence comparison
sample_sizes = np.logspace(2, 4, 20).astype(int)
errors_naive = []
errors_is = []

# The loop variable is deliberately not named `n`: that name already holds the
# notebook-level sample count used by the naive-MC demo, and reusing it here
# would silently overwrite it.
for n_conv in sample_sizes:
    # Naive
    p_naive, _, _ = naive_tail_estimate(threshold, n_conv, seed=42)
    errors_naive.append(abs(p_naive - true_prob))
    
    # IS
    sampler_temp = ImportanceSampler(proposal_sampler, log_weight_func, seed=42)
    r_is = sampler_temp.estimate(target_func, n_conv)
    errors_is.append(abs(r_is.estimate - true_prob))

ax2.loglog(sample_sizes, errors_naive, 'o-', label='Naive MC', linewidth=2)
ax2.loglog(sample_sizes, errors_is, 's-', label='Importance Sampling', linewidth=2)
ax2.set_xlabel('Number of samples (N)')
ax2.set_ylabel('Absolute error')
ax2.set_title('Convergence: Naive vs IS')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / '02_importance_sampling_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Plot saved to reports/02_importance_sampling_comparison.png")

**Key insight:** IS samples concentrate in the tail (orange) where the event happens, while naive MC wastes most samples in irrelevant regions!

## 5. Sensitivity to Proposal Choice

**Critical question:** What if we choose a *bad* proposal?

Let's compare three proposals:
1. **Good:** $q_1 = \mathcal{N}(3, 1)$ - centered on threshold
2. **Mediocre:** $q_2 = \mathcal{N}(2, 1)$ - partially shifted
3. **Bad:** $q_3 = \mathcal{N}(0, 2^2)$ (standard deviation $\sigma = 2$) - too diffuse, wrong center

In [None]:
def _make_normal_proposal(mu, sigma):
    """Build the (sampler, log-weight) pair for a N(mu, sigma²) proposal.

    The target is N(0, 1) in every case. ``sampler(n)`` draws n proposal
    samples from NumPy's global RNG; ``log_weight(x)`` returns
    log p(x) - log q(x) with p = N(0, 1) and q = N(mu, sigma²).
    """
    def sampler(n):
        return np.random.randn(n) * sigma + mu

    def log_weight(x):
        return stats.norm.logpdf(x, 0, 1) - stats.norm.logpdf(x, mu, sigma)

    return sampler, log_weight

# Proposal 1: Good - N(3, 1), centred on the threshold
proposal_good, log_weight_good = _make_normal_proposal(3, 1)

# Proposal 2: Mediocre - N(2, 1), only partially shifted
proposal_mediocre, log_weight_mediocre = _make_normal_proposal(2, 1)

# Proposal 3: Bad - standard deviation 2 (variance 4), centred on 0
proposal_bad, log_weight_bad = _make_normal_proposal(0, 2)

# Run every proposal against the same target with the same seed and budget
proposals = [
    ("Good (μ=3, σ=1)", proposal_good, log_weight_good),
    ("Mediocre (μ=2, σ=1)", proposal_mediocre, log_weight_mediocre),
    ("Bad (μ=0, σ=2)", proposal_bad, log_weight_bad),
]

print(f"Target: P(X > 3) = {true_prob:.6f}\n")
results_comparison = []

for name, prop_sampler, log_weight in proposals:
    sampler = ImportanceSampler(prop_sampler, log_weight, seed=42)
    result = sampler.estimate(target_func, 10000)
    results_comparison.append((name, result))

    # One report per proposal: estimate, variance reduction, relative error
    print("\n".join([
        f"{name}:",
        f"  Estimate: {result.estimate:.6f} ± {result.std_error:.6f}",
        f"  VRF: {result.variance_reduction_factor:.1f}x",
        f"  Rel. error: {abs(result.estimate - true_prob)/true_prob * 100:.1f}%\n",
    ]))

print("✓ Good proposal → large VRF")
print("⚠ Bad proposal → little or no improvement")

**Lesson:** Proposal choice is **critical**. Rules of thumb:
- Overlap with target where $|h(x)p(x)|$ is large
- Heavier tails than target (if unsure)
- For tail events: shift mean towards the tail

## 6. Pitfalls and Failure Modes

### Pitfall 1: Weight Degeneracy

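If $q$ has lighter tails than $p$, a few samples get **huge weights** → variance explodes! In the extreme case where $q$ has bounded support, the region it never samples contributes nothing at all, so the estimate is silently **biased** even though every weight is bounded.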

In [None]:
# Dangerous: proposal with lighter tails (here: bounded support)
def proposal_light_tails(n):
    """Draw n samples from Uniform[-1, 1]; values with |x| > 1 are never produced."""
    return np.random.uniform(-1, 1, n)  # Bounded support!

def log_weight_light(x):
    """Log importance weight for target N(0,1) and proposal Uniform[-1,1]."""
    log_p = stats.norm.logpdf(x, 0, 1)
    log_q = stats.uniform.logpdf(x, -1, 2)  # loc=-1, scale=2 → Uniform[-1,1]
    return log_p - log_q

# Function with significant tail mass
def tail_sensitive_func(x):
    """Integrand x², which grows in the tails."""
    return x**2  # Grows in tails

# Reference value: E[X²] = Var[X] = 1 for X ~ N(0,1). Without it the
# estimate below cannot be judged.
true_second_moment = 1.0

sampler_bad = ImportanceSampler(proposal_light_tails, log_weight_light, seed=42)
try:
    result_bad = sampler_bad.estimate(tail_sensitive_func, 10000)
    print(f"True value: {true_second_moment:.6f}")
    print(f"Estimate: {result_bad.estimate:.6f}")
    print(f"Std error: {result_bad.std_error:.6f}")
    print(f"VRF: {result_bad.variance_reduction_factor:.6f}")
    # Only claim "worse than naive MC" when the reported VRF actually says so
    if result_bad.variance_reduction_factor < 1:
        print(f"\n⚠ VRF < 1 means IS is WORSE than naive MC!")
    # On [-1, 1] the weight p/q = 2·φ(x) is bounded, so the reported spread can
    # look small; the real damage is that the mass of x²·p(x) outside [-1, 1]
    # is never sampled. Flag an error far beyond what the std error explains.
    bias = result_bad.estimate - true_second_moment
    if abs(bias) > 3 * result_bad.std_error:
        print(f"\n⚠ Estimate is off by {bias:+.6f}, far beyond the reported std error")
        print("  → the proposal never samples |x| > 1, so the estimate is biased")
except Exception as e:
    print(f"❌ Error: {e}")
    print("\n⚠ Numerical issues from extreme weights")

### Pitfall 2: Infinite Variance

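If $E_q\left[h(X)^2 w(X)^2\right] = \int h(x)^2 \frac{p(x)^2}{q(x)}\,dx = \infty$, the IS estimator has **infinite variance** even though it's unbiased! This typically happens when $w(x) = p(x)/q(x)$ grows without bound in the tails because $q$ decays faster than $p$.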

In [None]:
# Example: Cauchy target, Gaussian proposal
# w(x) = Cauchy(x) / N(x) ~ 1/(1+x²) / exp(-x²/2) → unbounded as x→∞
# Nothing is simulated here; the cell only prints the theoretical summary.
print("\n".join([
    "Theoretical example (not running to avoid instability):",
    "  Target: Cauchy distribution (heavy tails)",
    "  Proposal: N(0,1) (light tails)",
    "  Result: w(x) → ∞ as |x| → ∞",
    "\n⚠ IS estimator is unbiased but has infinite variance!",
    "✓ Always check: proposal must have heavier tails than target",
]))

### Pitfall 3: Numerical Underflow in Weights

**Always work in log space** when computing weights!

In [None]:
# BAD: Direct computation
# x = 40 is far enough out that the N(0,1) density (~exp(-800)) is below the
# smallest positive float64, so pdf() returns exactly 0 and the ratio p/q
# collapses to 0 although the true weight (~7e-51) is perfectly representable.
# (At a milder point such as x = 10 nothing underflows and both approaches
# agree, so the problem would stay invisible.)
x_test = np.array([40.0])  # Far in tail
p_x = stats.norm.pdf(x_test, 0, 1)
q_x = stats.norm.pdf(x_test, 3, 1)
w_direct = p_x / q_x
print(f"Direct weight computation:")
print(f"  p(x) = {p_x[0]:.2e}")
print(f"  q(x) = {q_x[0]:.2e}")
print(f"  w(x) = {w_direct[0]:.2e}  ← Underflowed to zero!\n")

# GOOD: Log-space computation
# The log densities are ordinary-sized numbers, so their difference is exact
# to rounding and only the final weight is exponentiated.
log_p = stats.norm.logpdf(x_test, 0, 1)
log_q = stats.norm.logpdf(x_test, 3, 1)
log_w = log_p - log_q
w_stable = np.exp(log_w)
print(f"Log-space weight computation:")
print(f"  log p(x) = {log_p[0]:.2f}")
print(f"  log q(x) = {log_q[0]:.2f}")
print(f"  log w(x) = {log_w[0]:.2f}")
print(f"  w(x) = {w_stable[0]:.2e}  ✓ Stable")

## 7. Key Takeaways

✓ **IS reduces variance** by sampling from $q$ concentrated where $|h(x)p(x)|$ is large

✓ **Unbiased estimator:** $E_q[h(X)w(X)] = E_p[h(X)]$

✓ **Speedup:** 10-1000x for tail probabilities, rare events

✓ **Optimal proposal:** $q^*(x) \propto |h(x)|p(x)$ (zero variance if $h \geq 0$)

✗ **Pitfall 1:** Light-tailed proposals → weight degeneracy

✗ **Pitfall 2:** Infinite variance if tails mismatch

✗ **Pitfall 3:** Numerical underflow → always use log-space weights

**Rule of thumb:** The proposal should have **tails at least as heavy as the target's**, and should **overlap with the high-mass regions of $|h(x)|p(x)$**.

## 8. Exercises

**Exercise 1:** Estimate $P(X > 4)$ for $X \sim \mathcal{N}(0,1)$ using both naive MC and IS. What VRF do you achieve?

**Exercise 2:** What happens if you use $q = \mathcal{N}(5, 1)$ (shifted too far) for the $P(X > 3)$ problem? Measure the VRF.

**Exercise 3:** Estimate $E[X^4]$ for $X \sim \mathcal{N}(0,1)$ (true value = 3) using IS with $q = \mathcal{N}(0, 2)$. Does IS help or hurt?

**Exercise 4:** Plot the **effective sample size** $ESS = (\sum w_i)^2 / \sum w_i^2$ for different proposal shifts $\mu \in \{0, 1, 2, 3, 4, 5\}$. Where is ESS maximized?

**Exercise 5:** Implement **self-normalized IS** (divide by $\sum w_i$ instead of $N$). When is this preferred?

**Exercise 6:** Design a proposal for estimating $\int_0^\infty e^{-x} \mathbb{1}_{x > 10} dx$ (tail of exponential). Compare VRF with exponential proposals of different rates.

In [None]:
# Your solutions here

---
## Solutions (Spoilers!)

<details>
<summary>Click to reveal</summary>

```python
# Exercise 1
threshold_ex1 = 4.0
# sf() evaluates the upper tail directly; 1 - cdf() loses digits as the
# threshold moves further out.
true_prob_ex1 = stats.norm.sf(threshold_ex1)

def target_ex1(x):
    """Indicator of the tail event x > threshold_ex1."""
    return (x > threshold_ex1).astype(float)

def proposal_ex1(n):
    """Draw n samples from the shifted proposal N(threshold_ex1, 1)."""
    return np.random.randn(n) + threshold_ex1

def log_weight_ex1(x):
    """Log weight log p(x) - log q(x) for p = N(0,1), q = N(threshold_ex1, 1)."""
    return stats.norm.logpdf(x, 0, 1) - stats.norm.logpdf(x, threshold_ex1, 1)

sampler_ex1 = ImportanceSampler(proposal_ex1, log_weight_ex1, seed=42)
result_ex1 = sampler_ex1.estimate(target_ex1, 10000)
print(f"True: {true_prob_ex1:.6f}")
print(f"IS: {result_ex1.estimate:.6f}")
print(f"VRF: {result_ex1.variance_reduction_factor:.1f}x")

# Exercise 4
def compute_ess(weights):
    """Effective sample size (Σw)² / Σw² of an array of importance weights."""
    return (np.sum(weights)**2) / np.sum(weights**2)

shifts = np.linspace(0, 5, 20)
ess_values = []
for shift in shifts:
    # Reseed for every shift so all proposals reuse the same base normals:
    # the curve is then reproducible and differences reflect the shift only.
    np.random.seed(42)
    # Sample from N(shift, 1) and weight against the N(0, 1) target
    samples = np.random.randn(1000) + shift
    log_w = stats.norm.logpdf(samples, 0, 1) - stats.norm.logpdf(samples, shift, 1)
    weights = np.exp(log_w)
    ess_values.append(compute_ess(weights))

plt.plot(shifts, ess_values)
plt.xlabel('Proposal shift μ')
plt.ylabel('Effective Sample Size')
plt.title('ESS vs Proposal Location')
plt.grid(True)
plt.show()
```

</details>

---
**Next:** [03_control_variates_and_antithetic_sampling.ipynb](03_control_variates_and_antithetic_sampling.ipynb) - Learn complementary variance reduction tricks!