In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from modules._import_helper import safe_import_from

# Import generators
# The package directory name ('05_simulation_monte_carlo') starts with a digit,
# which is not a valid Python identifier, so a regular `from ... import ...`
# statement cannot name it; the project helper is given the dotted path as a
# string instead.
# NOTE(review): the tuple unpacking presumes safe_import_from returns the objects
# in the same order as the name strings are passed — keep both lists in sync.
(generate_brownian_motion, generate_ou_process, generate_levy_flight,
 PhysicsDataGenerator, add_correlated_noise) = safe_import_from(
    '05_simulation_monte_carlo.src.synthetic_generators',
    'generate_brownian_motion', 'generate_ou_process', 'generate_levy_flight',
    'PhysicsDataGenerator', 'add_correlated_noise'
)

# Setup
# Seed NumPy's global RNG up front so unseeded np.random calls repeat across runs.
np.random.seed(42)

# Every figure produced by this notebook is saved under this directory.
reports_dir = Path('..') / 'reports'
reports_dir.mkdir(exist_ok=True)

# Matplotlib defaults shared by the figures below.
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (14, 5)})

## 1. Why Synthetic Data?

**Advantages:**
- ✓ **Known ground truth** → can measure true error, not just validation error
- ✓ **Control complexity** → gradually increase difficulty to debug models
- ✓ **No data collection cost** → infinite training data
- ✓ **Reproducible** → same experiments, different researchers
- ✓ **Test edge cases** → simulate rare events, outliers

**When to use:**
- Algorithm development and debugging
- Benchmarking new methods
- Understanding model failure modes
- Ablation studies (isolate specific effects)

**When NOT to use:**
- Final production models (always validate on real data!)
- When synthetic ≠ real distribution (domain shift)

## 2. Stochastic Processes

### 2.1 Brownian Motion (Wiener Process)

$$dX_t = \mu dt + \sigma dW_t$$

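- **Drift:** $\mu$ (constant trend)
- **Diffusion:** $\sigma$ (random fluctuations)
- **Solution:** $X_t = X_0 + \mu t + \sigma W_t$, so $X_t \sim \mathcal{N}(X_0 + \mu t,\ \sigma^2 t)$ — the mean grows like $\mu t$ and the standard deviation like $\sigma\sqrt{t}$
- **Applications:** Stock prices (usually through the log-price, i.e. geometric Brownian motion), particle diffusion, sensor noise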

In [None]:
# Generate Brownian motion paths and check the terminal distribution.
# The process parameters are named once so that both simulation calls, the plot
# titles and the printed theoretical values cannot drift out of sync.
bm_t_max = 10.0          # time horizon
bm_n_steps = 1000        # forwarded as n_steps
bm_mu = 0.1              # Positive drift
bm_sigma = 1.0           # diffusion coefficient
bm_n_hist_paths = 5000   # ensemble size for the histogram

t, paths_bm = generate_brownian_motion(
    t_max=bm_t_max,
    n_steps=bm_n_steps,
    mu=bm_mu,
    sigma=bm_sigma,
    n_paths=5,
    seed=42
)

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left: Sample paths (one row of paths_bm per path)
for path in paths_bm:
    ax1.plot(t, path, alpha=0.7, linewidth=1.5)
ax1.axhline(0, color='black', linestyle='--', alpha=0.3)
ax1.set_xlabel('Time')
ax1.set_ylabel('Position')
ax1.set_title(f'Brownian Motion (μ={bm_mu}, σ={bm_sigma})')
ax1.grid(True, alpha=0.3)

# Right: Distribution at final time, estimated from a much larger ensemble.
# Keyword arguments (the same names as in the call above) make the meaning of
# each value explicit; the returned time grid is not needed here.
_t_many, paths_many = generate_brownian_motion(
    t_max=bm_t_max,
    n_steps=bm_n_steps,
    mu=bm_mu,
    sigma=bm_sigma,
    n_paths=bm_n_hist_paths,
    seed=42
)
final_values = paths_many[:, -1]  # last column = value of every path at the final step
final_mean = np.mean(final_values)
ax2.hist(final_values, bins=50, density=True, alpha=0.7, edgecolor='black')
ax2.axvline(final_mean, color='red', linestyle='--',
            linewidth=2, label=f'Mean={final_mean:.2f}')
ax2.set_xlabel('Final Position')
ax2.set_ylabel('Density')
ax2.set_title(f'Distribution at t={bm_t_max:g} ({bm_n_hist_paths} paths)')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / '05_brownian_motion.png', dpi=150, bbox_inches='tight')
plt.show()

# Theory vs. simulation.
# NOTE(review): the expected mean μ*t presumes the generator starts every path
# at 0 — confirm against generate_brownian_motion.
print(f"Expected mean at t={bm_t_max:g}: μ*t = {bm_mu * bm_t_max:.2f}")
print(f"Observed mean: {final_mean:.2f}")
print(f"Expected std: σ*sqrt(t) = {bm_sigma * np.sqrt(bm_t_max):.2f}")
print(f"Observed std: {np.std(final_values):.2f}")

### 2.2 Ornstein-Uhlenbeck Process (Mean-Reverting)

$$dX_t = \theta(\mu - X_t)dt + \sigma dW_t$$

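- **Mean reversion:** $\theta > 0$ (speed of return to $\mu$; the relaxation time is $1/\theta$)
- **Long-run mean:** $\mu$
- **Noise strength:** $\sigma$ (the stationary standard deviation is $\sigma/\sqrt{2\theta}$)
- **Applications:** Interest rates, commodity prices, temperature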

In [None]:
# Compare different mean reversion rates: one panel per θ, everything else fixed
thetas = [0.5, 2.0, 5.0]
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Settings shared by all three simulations
ou_settings = dict(
    t_max=10.0,
    n_steps=1000,
    mu=0.0,
    sigma=1.0,
    x0=3.0,  # Start far from mean
    n_paths=10,
    seed=42,
)

for panel, theta in zip(axes, thetas):
    t_ou, paths_ou = generate_ou_process(theta=theta, **ou_settings)

    # One line per simulated path (rows of paths_ou)
    for path in paths_ou:
        panel.plot(t_ou, path, alpha=0.5, linewidth=1)
    panel.axhline(0, color='red', linestyle='--', linewidth=2, label='Mean μ=0')
    panel.set(xlabel='Time', ylabel='Position', title=f'OU Process (θ={theta})')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / '05_ou_process.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Larger θ → faster mean reversion")
print("✓ Useful for modeling systems with restoring forces")

### 2.3 Lévy Flights (Heavy-Tailed Jumps)

$$X_{t+1} = X_t + \Delta X, \quad \Delta X \sim \text{Lévy}(\alpha)$$

- **Stability parameter:** $\alpha \in (0, 2]$ 
  - $\alpha = 2$: Gaussian (Brownian motion)
  - $\alpha = 1$: Cauchy distribution
- **Applications:** Anomalous diffusion, animal foraging, financial crashes

In [None]:
# Compare different stability parameters: one panel per α
alphas = [0.5, 1.0, 1.5, 2.0]
fig, axes = plt.subplots(1, len(alphas), figsize=(16, 4))

for panel_idx, stability in enumerate(alphas):
    panel = axes[panel_idx]
    t_levy, paths_levy = generate_levy_flight(
        n_steps=1000, alpha=stability, scale=0.1, n_paths=5, seed=42
    )

    # Draw every simulated path (rows of paths_levy) on this panel
    for path in paths_levy:
        panel.plot(t_levy, path, alpha=0.7, linewidth=1)
    panel.set(xlabel='Step', ylabel='Position', title=f'Lévy Flight (α={stability})')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / '05_levy_flights.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Smaller α → heavier tails → larger jumps")
print("✓ α=2 recovers Brownian motion")

## 3. Physics-Inspired Regression Problems

Generate datasets with known functional forms for ML benchmarking.

In [None]:
generator = PhysicsDataGenerator(seed=42)

# All three benchmark problems are drawn with the same sample count and noise level.
# The calls stay in this order because they share one seeded generator.
sampling = dict(n_samples=200, noise_level=0.05)

# Problem 1: Damped Harmonic Oscillator
# x(t) = A*exp(-γt)*cos(ωt + φ)
X_damped, y_damped, desc_damped, truth_damped = generator.damped_oscillator(**sampling)

# Problem 2: Projectile Motion
# y = v₀*sin(θ)*t - 0.5*g*t²
X_proj, y_proj, desc_proj, truth_proj = generator.projectile(**sampling)

# Problem 3: Heat Diffusion
# T(x,t) = T₀*erf(x/(2*sqrt(α*t)))
X_heat, y_heat, desc_heat, truth_heat = generator.heat_diffusion(**sampling)

# Plot all three: noisy targets against the first feature, with the ground-truth
# callable evaluated on the same samples ordered by that feature.
panels = (
    ("Damped Oscillator", X_damped, y_damped, truth_damped),
    ("Projectile Motion", X_proj, y_proj, truth_proj),
    ("Heat Diffusion", X_heat, y_heat, truth_heat),
)
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

for ax, (panel_title, features, targets, truth_fn) in zip(axes, panels):
    first_feature = features[:, 0]
    # Noisy observations
    ax.scatter(first_feature, targets, alpha=0.5, s=20, label='Noisy data')
    # Ground truth, drawn left to right so the line does not zig-zag
    features_sorted = features[np.argsort(first_feature)]
    ax.plot(features_sorted[:, 0], truth_fn(features_sorted), 'r-',
            linewidth=2, label='Ground truth')
    ax.set(xlabel='Feature 1', ylabel='Target', title=panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / '05_physics_regression_problems.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n".join([
    "✓ Each problem has:",
    "  - Features X (input variables)",
    "  - Targets y (noisy observations)",
    "  - Ground truth function (for true error measurement)",
    "  - Description (physical interpretation)",
]))

## 4. ML Workflow Example: Testing a Model

Let's use synthetic data to test a simple regression model:

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Generate data: features, noisy targets, a text description and a callable
# that returns the noiseless target for given features.
X, y, desc, ground_truth = generator.pendulum_energy(n_samples=500, noise_level=0.1)

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Measure error (test set): this is all one could compute on real data.
test_mse = np.mean((y_test - y_pred)**2)
test_rmse = np.sqrt(test_mse)

# Measure TRUE error (ground truth): only possible because the data is synthetic.
y_true = ground_truth(X_test)
true_mse = np.mean((y_true - y_pred)**2)
true_rmse = np.sqrt(true_mse)

# Noise level (difference between true and observed)
noise_mse = np.mean((y_test - y_true)**2)
noise_rmse = np.sqrt(noise_mse)

print(f"Problem: {desc}\n")
print(f"Test RMSE (with noise): {test_rmse:.4f}")
print(f"True RMSE (vs ground truth): {true_rmse:.4f}")
print(f"Noise level: {noise_rmse:.4f}\n")

# The errors combine on the *squared* scale, not as a difference of RMSEs:
#   y_test - y_pred = (y_true - y_pred) + (y_test - y_true)
# so test MSE = true MSE + noise MSE + 2 * mean(cross term), and the cross term
# is close to zero when the test noise is independent of the model error.
print("✓ Synthetic data lets us separate:")
print("  - Model error (true RMSE)")
print("  - Noise contribution (test MSE ≈ true MSE + noise MSE)")
print(f"    {test_mse:.4f} ≈ {true_mse:.4f} + {noise_mse:.4f} = {true_mse + noise_mse:.4f}")

In [None]:
# Visualize predictions: the same predictions compared against two references
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

comparisons = [
    # (axes, reference values, extra scatter options, x-axis label, title)
    # Left: Predictions vs observations
    (ax1, y_test, {}, 'Observed (noisy)', 'Predictions vs Noisy Observations'),
    # Right: Predictions vs ground truth
    (ax2, y_true, {'color': 'green'}, 'Ground truth (noiseless)', 'Predictions vs Ground Truth'),
]

for ax, reference, scatter_opts, x_label, panel_title in comparisons:
    ax.scatter(reference, y_pred, alpha=0.5, s=30, **scatter_opts)
    # Diagonal spanning the joint range of both axes marks a perfect prediction
    lims = [min(reference.min(), y_pred.min()), max(reference.max(), y_pred.max())]
    ax.plot(lims, lims, 'r--', linewidth=2, label='Perfect prediction')
    ax.set(xlabel=x_label, ylabel='Predicted', title=panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / '05_ml_predictions.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Correlated Noise

Real data often has **correlated noise** (e.g., autocorrelation in time series).

Generate realistic noise using an Ornstein-Uhlenbeck process:

In [None]:
# Clean signal: a 0.5 Hz sine sampled at 200 points on [0, 10]
t = np.linspace(0, 10, 200)
signal = np.sin(2 * np.pi * 0.5 * t)

# Both noisy versions use the same amplitude so the panels are comparable.
noise_scale = 0.3

# Add correlated noise
# NOTE(review): whether correlation_time is measured in time units or in samples
# depends on add_correlated_noise — confirm against its documentation.
noisy_signal = add_correlated_noise(
    signal,
    noise_scale=noise_scale,
    correlation_time=0.5,  # Correlation length
    seed=42
)

# Add IID noise for comparison.
# A local RandomState(42) produces exactly the samples that np.random.seed(42)
# followed by np.random.randn would, but leaves the global RNG untouched, so
# cells run after this one are not silently re-seeded.
iid_rng = np.random.RandomState(42)
iid_noisy_signal = signal + iid_rng.randn(len(signal)) * noise_scale

# Plot: clean signal, IID-noise version, correlated-noise version
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 8))

ax1.plot(t, signal, 'k-', linewidth=2, label='True signal')
ax1.set_ylabel('Clean')
ax1.legend()
ax1.grid(True, alpha=0.3)

ax2.plot(t, iid_noisy_signal, alpha=0.7, label='IID noise')
ax2.plot(t, signal, 'k--', alpha=0.5, linewidth=1)  # clean signal for reference
ax2.set_ylabel('IID Noise')
ax2.legend()
ax2.grid(True, alpha=0.3)

ax3.plot(t, noisy_signal, alpha=0.7, color='orange', label='Correlated noise')
ax3.plot(t, signal, 'k--', alpha=0.5, linewidth=1)  # clean signal for reference
ax3.set_xlabel('Time')
ax3.set_ylabel('Correlated Noise')
ax3.legend()
ax3.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / '05_correlated_noise.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Correlated noise is smoother (autocorrelated)")
print("✓ More realistic for many applications (sensors, finance, etc.)")

## 6. Key Takeaways

✓ **Synthetic data = known ground truth** → measure true model error

✓ **Stochastic processes:**
- Brownian: Constant drift + diffusion
- OU: Mean-reverting
- Lévy: Heavy-tailed jumps

✓ **Physics problems** provide realistic functional forms for testing

✓ **Correlated noise** → more realistic than IID

✓ **Workflow:**
1. Generate synthetic data with ground truth
2. Train/test ML model
3. Measure true error (vs ground truth)
4. Debug model on controlled examples
5. Validate on real data

⚠ **Limitation:** Synthetic ≠ real (domain shift). Always validate on real data before deployment!

## 7. Exercises

**Exercise 1:** Generate a dataset mixing two physics problems (e.g., oscillator + projectile). Train a model and check if it can learn both components.

**Exercise 2:** Compare model performance (Random Forest, Linear Regression, Neural Network) on the heat diffusion problem. Which works best?

**Exercise 3:** Gradually increase noise level from 0 to 0.5. Plot model RMSE vs noise. Where does your model break down?

**Exercise 4:** Generate Brownian motion with different $\mu$ and $\sigma$. Train a model to **predict** the parameters from a single path.

**Exercise 5:** **Challenge:** Create a time-series forecasting problem using an OU process. Generate sequences, train an LSTM, and measure true error vs prediction horizon.

**Exercise 6:** Add outliers (5% of points with 10x noise) to a physics dataset. Test robust regression methods (Huber, RANSAC) vs OLS.

In [None]:
# Your solutions here

---
## Summary: Module 05 Complete!

You've learned:
1. ✅ MC integration with error bars and convergence diagnostics
2. ✅ Variance reduction: importance sampling (IS), 10–1000x speedup
3. ✅ Control variates and antithetic sampling
4. ✅ Rare event estimation ($P < 10^{-6}$)
5. ✅ Synthetic data generation for ML

**Next steps:**
- Apply these methods to your research problems
- Read: *Monte Carlo Statistical Methods* by Robert & Casella
- Explore: Quasi-Monte Carlo, MCMC (Module 06), multilevel MC

**Key principle:** Always report uncertainty. MC estimates are random!