In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path
from modules._import_helper import safe_import_from

# Import rare event tools
RareEventEstimator, tail_probability, adaptive_sampling = safe_import_from(
    '05_simulation_monte_carlo.src.rare_events',
    'RareEventEstimator', 'tail_probability', 'adaptive_sampling'
)

# --- Notebook configuration ---

# Plotting: start from matplotlib's default style, then widen the default figure
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 5)

# Output directory for saved figures (created if it does not exist yet)
reports_dir = Path('../reports')
reports_dir.mkdir(exist_ok=True)

# Reproducibility: seed NumPy's global RNG, which later cells draw from
# through np.random.randn
np.random.seed(42)

## 1. The Rare Event Problem

**Definition:** Event with probability $P < 10^{-3}$ (less than 1 in 1000).

**Why naive MC fails:**
- Need $N \sim 1/P$ samples to get **one hit** on average
- For $P = 10^{-6}$: need ~1M samples for single hit!
- Standard error: $SE = \sqrt{P(1-P)/N} \approx \sqrt{P/N}$
- Relative error: $SE/P \approx \sqrt{1/(NP)} \to \infty$ as $P \to 0$ (for fixed $N$)

**Examples:**
- System failure probability
- Financial risk (VaR at 99.9% confidence)
- Particle physics cross-sections
- Network packet loss in high-reliability systems

## 2. Naive MC Breakdown Demo

Let's try to estimate $P(Z > 4)$ for $Z \sim \mathcal{N}(0,1)$ (true prob ≈ $3.17 \times 10^{-5}$)

In [None]:
threshold = 4.0
# Use the survival function instead of 1 - cdf: subtracting a cdf value that
# is very close to 1 loses significant digits to cancellation, which matters
# precisely for the small tail probabilities studied here.
true_prob = stats.norm.sf(threshold)

print(f"Target: P(Z > {threshold}) = {true_prob:.6e}\n")

# Repeat the naive estimate with growing sample sizes
for n in [10**3, 10**4, 10**5, 10**6]:
    samples = np.random.randn(n)
    hits = np.sum(samples > threshold)
    p_hat = hits / n
    if hits > 0:
        # Binomial standard error of p_hat, reported relative to the true probability
        se = np.sqrt(p_hat * (1 - p_hat) / n)
        rel_error = se / true_prob
    else:
        # Zero hits: the plug-in standard error collapses to 0, which carries
        # no information about the real uncertainty, so report infinity.
        se = 0
        rel_error = float('inf')

    print(f"N={n:>7}: hits={hits:>4}, p̂={p_hat:.6e}, rel_SE={rel_error:.1%}")

print("\n⚠ Even with 1M samples: very few hits, huge relative error!")
print("✓ Need specialized rare-event methods")

## 3. Importance Sampling for Tail Probabilities

**Strategy:** Shift distribution to the tail!

For $P(Z > a)$ with $Z \sim \mathcal{N}(0,1)$:
- **Target:** $p(z) = \mathcal{N}(z; 0, 1)$
- **Proposal:** $q(z) = \mathcal{N}(z; \mu, 1)$ with $\mu \approx a$
- **Weight:** $w(z) = \frac{p(z)}{q(z)} = \exp\left(-\frac{z^2}{2} + \frac{(z-\mu)^2}{2}\right) = \exp\left(-\mu z + \frac{\mu^2}{2}\right)$

**Optimal shift:** $\mu^* \approx a$ (centers proposal on threshold; the exact variance-minimizing shift is slightly larger than $a$)

In [None]:
# Naive MC and importance sampling on the same problem, same budget, same seed
n_samples = 10000

# Everything except `method` is shared between the two runs
shared_kwargs = dict(
    threshold=threshold,
    distribution='normal',
    loc=0.0,
    scale=1.0,
    n_samples=n_samples,
    seed=42,
)

result_naive = tail_probability(method='naive', **shared_kwargs)
result_is = tail_probability(method='importance', **shared_kwargs)

print(f"True probability: {true_prob:.6e}\n")

# One report per method; the naive report is followed by a blank line
for label, res, trailer in (
    ("Naive MC", result_naive, "\n"),
    ("Importance Sampling", result_is, ""),
):
    print(f"{label}:")
    print(f"  Estimate: {res.probability:.6e} ± {res.std_error:.6e}")
    print(f"  Rel error: {res.relative_error:.2%}")
    print(f"  Hits: {res.n_events}/{n_samples}{trailer}")

# Ratio of reported standard errors (naive / IS), guarded against a zero divisor.
# NOTE(review): if the naive run records no hits its reported std_error may be
# degenerate, which would make this ratio uninformative — confirm how
# tail_probability reports that case.
if result_is.std_error > 0:
    improvement = result_naive.std_error / result_is.std_error
else:
    improvement = float('inf')
print(f"\n✓ IS is ~{improvement:.0f}x more efficient!")

## 4. Visualizing the Difference

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# ---- Left panel: where each sampler puts its mass ----
x = np.linspace(-2, 8, 300)
in_tail = x > threshold  # grid points inside the rare region

ax1.plot(x, stats.norm.pdf(x, 0, 1), 'b-', linewidth=2, label='Target: N(0,1)')
ax1.plot(x, stats.norm.pdf(x, threshold, 1), 'orange', linewidth=2,
         label=f'Proposal: N({threshold},1)')
ax1.axvline(threshold, color='red', linestyle='--', linewidth=2, label=f'Threshold={threshold}')
ax1.fill_between(x[in_tail], 0, stats.norm.pdf(x[in_tail], 0, 1),
                 alpha=0.3, color='red', label='Rare region')
ax1.set(xlabel='z', ylabel='Density',
        title='Naive MC wastes samples outside rare region')
ax1.legend()
ax1.grid(True, alpha=0.3)

# ---- Right panel: standard error versus sample size ----
se_floor = 1e-10  # keeps a zero standard error plottable on the log axis
sample_sizes = np.logspace(2, 5, 15).astype(int)
errors_naive, errors_is = [], []

for n in sample_sizes:
    # Same call order as before: naive first, then importance sampling
    for method, collected in (('naive', errors_naive), ('importance', errors_is)):
        run = tail_probability(threshold, 'normal', 0, 1, n, method, seed=42)
        collected.append(max(run.std_error, se_floor))

ax2.loglog(sample_sizes, errors_naive, 'o-', label='Naive MC', linewidth=2)
ax2.loglog(sample_sizes, errors_is, 's-', label='Importance Sampling', linewidth=2)
# Reference level: a standard error equal to 10% of the true probability
ax2.axhline(true_prob * 0.1, color='gray', linestyle='--', alpha=0.5,
            label='10% relative error')
ax2.set(xlabel='Number of samples', ylabel='Standard error',
        title='Convergence: Naive vs IS')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / '04_rare_events_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Plot saved to reports/04_rare_events_comparison.png")

## 5. Adaptive Sampling

**Problem:** Don't know how rare the event is beforehand!

**Solution:** Two-phase estimation:
1. **Pilot phase:** Small run to estimate $\hat{P}$
2. **Refinement:** Use $\hat{P}$ to compute needed $N$ for target accuracy

For target relative error $\epsilon$:
$$N \approx \frac{1-P}{\epsilon^2 P}$$

In [None]:
# Adaptive sampling example
threshold_adapt = 3.5
# Survival function = upper-tail probability, evaluated without the
# cancellation error of 1 - cdf
true_prob_adapt = stats.norm.sf(threshold_adapt)


def event_func(x):
    """Return 1.0 where a sample exceeds ``threshold_adapt`` and 0.0 elsewhere."""
    return (x > threshold_adapt).astype(float)


def sampler(n):
    """Draw ``n`` standard-normal samples from NumPy's global RNG."""
    # NOTE(review): these draws come from the global np.random state; whether
    # the `seed` passed to adaptive_sampling below also controls them depends
    # on its implementation — confirm.
    return np.random.randn(n)


result_adaptive = adaptive_sampling(
    event=event_func,
    sampler=sampler,
    n_pilot=1000,
    target_relative_error=0.1,  # 10% relative error
    max_samples=100000,
    seed=42
)

print(f"True probability: {true_prob_adapt:.6e}\n")
print("Adaptive sampling result:")
print(f"  Pilot samples: {result_adaptive.n_pilot}")
print(f"  Total samples: {result_adaptive.n_total}")
print(f"  Estimate: {result_adaptive.probability:.6e} ± {result_adaptive.std_error:.6e}")
print(f"  Relative error: {result_adaptive.relative_error:.2%}")
print(f"\n✓ Automatically determined N={result_adaptive.n_total} for target accuracy")

## 6. Common Pitfalls

### Pitfall 1: False Confidence from Zero Hits

**Problem:** $\hat{P} = 0$ doesn't mean $P = 0$!

In [None]:
# Extremely rare event
threshold_extreme = 5.0
# sf(x) avoids the cancellation in 1 - cdf(x), which is already visible in
# the leading digits this far into the tail
true_prob_extreme = stats.norm.sf(threshold_extreme)

n = 10000
samples = np.random.randn(n)
hits = np.sum(samples > threshold_extreme)

print(f"True P(Z > {threshold_extreme}) = {true_prob_extreme:.6e}")
print(f"Naive MC with N={n}: {hits} hits")
print(f"Estimate: {hits/n:.6e}")

if hits == 0:
    # The Rule of Three bound is only valid when no event was observed
    print("\n⚠ Zero hits ≠ zero probability!")
    print("✓ Upper bound: P < 3/N with 95% confidence (Rule of Three)")
    print(f"   For our case: P < {3/n:.6e}")
else:
    # Unlikely at this threshold and sample size, but possible: do not print
    # a zero-hit bound for a run that did observe the event
    print(f"\n⚠ This run observed {hits} hit(s); the zero-hit Rule of Three bound does not apply")

### Pitfall 2: Over-shifting the Proposal

In [None]:
# Compare different proposal shifts
threshold_test = 3.0
# Upper-tail probability via the survival function (no 1 - cdf cancellation)
true_test = stats.norm.sf(threshold_test)

shifts = [2.0, 3.0, 4.0, 5.0]  # Shift proposals
print(f"True P(Z > {threshold_test}) = {true_test:.6e}\n")


def event(x):
    """Return 1.0 where a sample exceeds ``threshold_test`` and 0.0 elsewhere."""
    return (x > threshold_test).astype(float)


for shift in shifts:
    # Both helpers read the current loop value of `shift`; they are only used
    # inside this iteration.
    def prop(n):
        """Draw ``n`` samples from N(shift, 1) using NumPy's global RNG."""
        return np.random.randn(n) + shift

    def log_weight(x):
        """Log density ratio: log N(x; 0, 1) - log N(x; shift, 1)."""
        return stats.norm.logpdf(x, 0, 1) - stats.norm.logpdf(x, shift, 1)

    estimator = RareEventEstimator(seed=42)
    # NOTE(review): `log_weight` is passed for both the third and the fourth
    # positional parameter — confirm against the signature of
    # estimate_importance_sampling that this is intended.
    result = estimator.estimate_importance_sampling(
        event, prop, log_weight, log_weight, n_samples=5000
    )

    print(f"Shift μ={shift}: rel_error={result.relative_error:.2%}, hits={result.n_events}")

print(f"\n✓ Optimal shift ≈ threshold (μ = {threshold_test})")
print("⚠ Over-shifting (μ >> threshold) wastes samples")

### Pitfall 3: Not Validating with Known Test Cases

In [None]:
# Validation workflow
print("✓ Always validate rare-event code:")
print("  1. Test on less rare event (P ~ 0.01) where naive MC works")
print("  2. Check IS gives same answer with lower variance")
print("  3. Gradually decrease P and verify consistency")
print("  4. Compare multiple IS proposals for robustness\n")

# Quick validation: compare the IS estimate with the exact tail probability
# at progressively rarer thresholds
for thresh in [2.0, 2.5, 3.0, 3.5, 4.0]:
    # Exact reference value; sf avoids the cancellation error of 1 - cdf
    true_p = stats.norm.sf(thresh)
    r_is = tail_probability(thresh, 'normal', 0, 1, 10000, 'importance', seed=42)
    error = abs(r_is.probability - true_p) / true_p  # relative deviation from the exact value
    print(f"P(Z>{thresh:.1f})={true_p:.2e}: IS error={error:.2%}")

# NOTE: this summary is printed unconditionally; judge consistency from the
# per-threshold errors listed above.
print("\n✓ Consistent accuracy across probability range")

## 7. Key Takeaways

✓ **Rare events (P < 10^-3) require specialized methods**

✓ **IS with exponential tilting:** Shift $\mu \approx a$ for $P(Z > a)$

✓ **Adaptive sampling:** Use pilot to determine needed $N$

✓ **Zero hits ≠ zero probability** → Use Rule of Three for upper bounds

✗ **Pitfall:** Over-shifting proposal wastes samples

✗ **Pitfall:** Not validating on test cases with known P

**When to use:**
- $P < 10^{-3}$: Consider importance sampling
- $P < 10^{-6}$: IS essential
- $P < 10^{-9}$: May need splitting/multilevel methods (beyond this course)

## 8. Exercises

**Exercise 1:** Estimate $P(|Z| > 4)$ for $Z \sim \mathcal{N}(0,1)$ using symmetric proposals. What's the optimal strategy?

**Exercise 2:** For exponential distribution, estimate $P(X > 10)$ where $X \sim \text{Exp}(\lambda=1)$. Design an IS proposal.

**Exercise 3:** Implement the **Rule of Three**: If 0 events in $N$ trials, $P < 3/N$ with 95% confidence. Verify this with simulations.

**Exercise 4:** Compare adaptive vs fixed-N sampling for varying $P \in [10^{-2}, 10^{-5}]$. Plot total samples needed.

**Exercise 5:** **Challenge:** Estimate $P(S_n > na + b\sqrt{n})$ for random walk $S_n = \sum_{i=1}^n X_i$, $X_i \sim \mathcal{N}(0,1)$. Use large deviations theory to design optimal IS.

In [None]:
# Your solutions here

---
**Next:** [05_synthetic_physics_data_generator.ipynb](05_synthetic_physics_data_generator.ipynb) - Generate controlled datasets for ML!