In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import sys

# Put the directory three levels above the current working directory at the
# front of sys.path so the project packages below can be resolved.
# NOTE(review): this depends on where the notebook is launched from — confirm
# that the kernel's working directory is the notebook's own folder.
sys.path.insert(0, str(Path.cwd().parent.parent.parent))

from modules._import_helper import safe_import_from

# Project helpers are looked up by dotted-path string through safe_import_from.
# Presumably this is needed because the package directories start with digits
# and cannot appear in a regular import statement — verify against the helper.
set_seed = safe_import_from('00_repo_standards.src.mlphys_core.seeding', 'set_seed')
pca_via_svd, PCAResult = safe_import_from('01_numerical_toolbox.src.linear_algebra', 
                                          'pca_via_svd', 'PCAResult')

# Seed the project's RNG helper (which generators it seeds is defined in the
# helper itself; the cells below additionally call np.random.seed directly).
set_seed(42)
# Output directory for figures and the text report, relative to the working
# directory; created together with any missing parents.
REPORTS_DIR = Path('../reports')
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

print("✓ Imports successful")

---

## 1. Intuition: What is PCA?

**Key Ideas:**
- **PCA finds the "best" low-dimensional representation** of high-dimensional data
- **"Best" = preserves maximum variance** (information)
- **Geometric view**: Rotate data to align with directions of maximum spread
- **Applications**: Dimensionality reduction, noise filtering, data compression, visualization
- **SVD is the numerical algorithm**: More stable than covariance eigendecomposition

**Analogy**: Imagine photographing a 3D object:
- Some camera angles capture more information than others
- PCA finds the "best viewpoints" (principal components)
- Taking photos from these angles loses the least information

---

## 2. Minimal Math: SVD Decomposition

### Singular Value Decomposition (SVD)
Any matrix $X \in \mathbb{R}^{n \times p}$ can be decomposed:

$$
X = U \Sigma V^T
$$

Where:
- $U \in \mathbb{R}^{n \times n}$: Left singular vectors (rotation in sample space)
- $\Sigma \in \mathbb{R}^{n \times p}$: Diagonal matrix of singular values $\sigma_1 \geq \sigma_2 \geq \ldots \geq 0$
- $V \in \mathbb{R}^{p \times p}$: Right singular vectors (**principal components**)

### Connection to PCA
For **centered** data $X$ (mean = 0):

1. **Covariance matrix**: $\text{Cov}(X) = \frac{1}{n-1} X^T X$
2. **Principal components**: Columns of $V$ (eigenvectors of $X^T X$)
3. **Explained variance**: $\text{Var}(\text{PC}_k) = \frac{\sigma_k^2}{n-1}$
4. **Projection**: $Z = X V$ (scores on principal components)

**Why SVD instead of eigendecomposition?**
- More numerically stable (avoids forming $X^T X$)
- Works even when $p > n$ (more features than samples)
- Fewer steps (center → SVD vs center → covariance → eigendecomp)

---

## 3. Implementation: PCA from Scratch

In [None]:
def my_pca(X, n_components=None):
    """
    Implement PCA via SVD from scratch.

    Args:
        X: Array-like of shape (n_samples, n_features). Rows are samples,
            columns are features. The input is not modified.
        n_components: Number of leading principal components to keep.
            ``None`` (default) keeps all ``min(n_samples, n_features)``.

    Returns:
        components: Principal component vectors as columns,
            shape (n_features, k).
        explained_variance: Variance explained by each PC
            (``singular_values**2 / (n_samples - 1)``), shape (k,).
        singular_values: Singular values of the centered data, shape (k,).
        mean: Per-feature data mean (for reconstruction), shape (n_features,).

    Raises:
        ValueError: If X is not 2-D, has fewer than two samples (the sample
            variance is undefined), or n_components is outside
            ``[0, min(n_samples, n_features)]``.
    """
    # Accept lists / nested sequences as well as ndarrays.
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(
            f"X must be 2-D (n_samples, n_features); got {X.ndim}-D input"
        )

    n_samples, n_features = X.shape
    if n_samples < 2:
        # The (n_samples - 1) denominator below would be zero.
        raise ValueError("PCA requires at least 2 samples")

    max_components = min(n_samples, n_features)
    if n_components is not None and not 0 <= n_components <= max_components:
        # A negative value would silently slice from the end, and a too-large
        # value would silently return fewer components than requested.
        raise ValueError(
            f"n_components={n_components} must be between 0 and "
            f"min(n_samples, n_features)={max_components}"
        )

    # Step 1: Center data (subtract mean)
    mean = np.mean(X, axis=0)
    X_centered = X - mean

    # Step 2: Compute the thin SVD; singular values come back in descending order
    _, singular_values, Vt = np.linalg.svd(X_centered, full_matrices=False)

    # Step 3: Extract principal components (columns of V = rows of Vt)
    components = Vt.T  # Shape: (n_features, min(n_samples, n_features))

    # Step 4: Compute explained variance (unbiased, n - 1 denominator)
    explained_variance = (singular_values ** 2) / (n_samples - 1)

    # Keep only the leading n_components
    if n_components is not None:
        components = components[:, :n_components]
        explained_variance = explained_variance[:n_components]
        singular_values = singular_values[:n_components]

    return components, explained_variance, singular_values, mean

# Sanity check on a 2-D cloud: standard-normal points scaled by 3 along the
# first feature and by 0.5 along the second, so PC1 should follow feature 1.
np.random.seed(42)
stretch = np.diag([3, 0.5])
X_simple = np.random.randn(100, 2) @ stretch

components, var, sigma, mean = my_pca(X_simple, n_components=2)
total_var = var.sum()

print("Principal Components (columns):")
print(components)
print(f"\nExplained Variance: {var}")
print(f"Variance Ratio: {var / total_var}")
print(f"\nFirst PC explains {100*var[0]/total_var:.1f}% of variance")

---

## 4. Visualization: 2D PCA Geometry

In [None]:
# Two-panel figure: the raw 2-D data with its principal directions (left) and
# the same data expressed in principal-component coordinates (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Original data
ax = axes[0]
ax.scatter(X_simple[:, 0], X_simple[:, 1], alpha=0.5, s=30)

# Plot principal component directions as arrows from the data mean; each arrow
# is `scale` standard deviations long along its component.
scale = 3  # Visual scale
for i, (pc, var_i) in enumerate(zip(components.T, var)):
    ax.arrow(mean[0], mean[1], 
             pc[0] * scale * np.sqrt(var_i), 
             pc[1] * scale * np.sqrt(var_i),
             head_width=0.3, head_length=0.3, fc=f'C{i+1}', ec=f'C{i+1}',
             linewidth=3, label=f'PC{i+1}')

ax.scatter(mean[0], mean[1], c='red', s=100, marker='x', linewidths=3, 
           label='Mean', zorder=10)
ax.set_xlabel('Feature 1', fontsize=12)
ax.set_ylabel('Feature 2', fontsize=12)
ax.set_title('Original Data + Principal Components', fontsize=13, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_aspect('equal')

# Projected data (PC coordinates)
ax = axes[1]
X_centered = X_simple - mean
Z = X_centered @ components  # Project onto PCs
ax.scatter(Z[:, 0], Z[:, 1], alpha=0.5, s=30)

# PC axes in transformed space. The horizontal line (PC2 = 0) is the PC1 axis
# and the vertical line (PC1 = 0) is the PC2 axis; colors match the PC1/PC2
# arrows in the left panel.
ax.axhline(0, color='C1', linestyle='--', linewidth=2, alpha=0.7, label='PC1 axis')
ax.axvline(0, color='C2', linestyle='--', linewidth=2, alpha=0.7, label='PC2 axis')

ax.set_xlabel('PC1 (Principal Component 1)', fontsize=12)
ax.set_ylabel('PC2 (Principal Component 2)', fontsize=12)
ax.set_title('Transformed Data (PC Space)', fontsize=13, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_aspect('equal')

plt.tight_layout()
plt.savefig(REPORTS_DIR / '04_pca_2d_geometry.png', dpi=150, bbox_inches='tight')
print(f"✓ Saved: {REPORTS_DIR / '04_pca_2d_geometry.png'}")
plt.show()

**Observations:**
- Left: PCs point in directions of maximum variance
- Right: Data is now aligned with coordinate axes
- PC1 captures most variance (data spread)

---

## 5. Real Example: Dimensionality Reduction

In [None]:
# Generate high-dimensional correlated data
np.random.seed(42)  # reseed so the draws in this cell are reproducible
n_samples = 200
n_features = 50

# Create covariance with exponentially decaying eigenvalues
# (simulates real data where only few dimensions matter)
eigenvalues = np.exp(-np.arange(n_features) / 5)  # exp(-k/5) for k = 0 .. n_features-1
# Q factor of the QR decomposition of a random Gaussian matrix: an orthogonal basis
Q, _ = np.linalg.qr(np.random.randn(n_features, n_features))
# Sigma = Q diag(eigenvalues) Q^T has the prescribed eigenvalues along the columns of Q
Sigma = Q @ np.diag(eigenvalues) @ Q.T

# Draw n_samples zero-mean Gaussian rows with covariance Sigma
X_high = np.random.multivariate_normal(np.zeros(n_features), Sigma, size=n_samples)

print(f"Data shape: {X_high.shape} (n_samples × n_features)")
# Sum of per-feature variances. np.var uses its default ddof=0 here, so this is
# (n_samples - 1) / n_samples times a total computed with the n - 1 denominator.
print(f"Total variance: {np.var(X_high, axis=0).sum():.2f}")

In [None]:
# Our from-scratch implementation (all components)
components_ours, var_ours, sigma_ours, mean_ours = my_pca(X_high)

# Using the repo's implementation
pca_result = pca_via_svd(X_high)

# Largest absolute disagreement between the two explained-variance vectors
max_diff = np.max(np.abs(var_ours - pca_result.explained_variance))

print("Comparison:")
print(f"  Our explained variance: {var_ours[:3]}")
print(f"  Repo explained variance: {pca_result.explained_variance[:3]}")
print(f"\n  Max difference: {max_diff:.2e}")
# Only claim agreement when it has actually been verified.
if np.allclose(var_ours, pca_result.explained_variance):
    print("  ✓ Match within numerical precision!")
else:
    print("  ✗ Mismatch: explained variances differ beyond numerical tolerance")

---

## 6. Explained Variance Analysis

In [None]:
# Per-component share of the total variance and its running sum
var_ratio = pca_result.explained_variance_ratio
cumulative_var = np.cumsum(var_ratio)

# For each target fraction, count the components needed: index of the first
# cumulative value that reaches the target, plus one.
thresholds = [0.8, 0.9, 0.95, 0.99]
components_needed = [np.argmax(cumulative_var >= level) + 1 for level in thresholds]

print("Components needed to capture:")
for thresh, n_comp in zip(thresholds, components_needed):
    compression = 100*(1 - n_comp/n_features)
    print(f"  {thresh*100:.0f}% variance: {n_comp} components "
          f"(compression: {compression:.1f}%)")

In [None]:
# Scree plot (left) and cumulative explained variance with threshold markers (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scree_ax, cumulative_ax = axes

# Left panel: variance ratio of the first 20 components
scree_ax.plot(range(1, 21), var_ratio[:20], 'bo-', linewidth=2, markersize=8)
scree_ax.set_xlabel('Principal Component', fontsize=12)
scree_ax.set_ylabel('Explained Variance Ratio', fontsize=12)
scree_ax.set_title('Scree Plot (First 20 PCs)', fontsize=13, fontweight='bold')
scree_ax.grid(True, alpha=0.3)

# Right panel: running total over all components
component_counts = range(1, n_features+1)
cumulative_ax.plot(component_counts, cumulative_var, 'ro-', linewidth=2, markersize=6)

# Mark where each target fraction is first reached
guide_style = dict(color='gray', linestyle='--', alpha=0.5)
for level in (0.9, 0.95, 0.99):
    k = np.argmax(cumulative_var >= level) + 1
    cumulative_ax.axhline(level, **guide_style)
    cumulative_ax.axvline(k, **guide_style)
    cumulative_ax.plot(k, level, 'ko', markersize=8)
    cumulative_ax.text(k+1, level-0.03, f'{k} PCs', fontsize=10)

cumulative_ax.set_xlabel('Number of Components', fontsize=12)
cumulative_ax.set_ylabel('Cumulative Explained Variance', fontsize=12)
cumulative_ax.set_title('Cumulative Variance Explained', fontsize=13, fontweight='bold')
cumulative_ax.grid(True, alpha=0.3)

plt.tight_layout()
figure_path = REPORTS_DIR / '04_variance_explained.png'
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
print(f"✓ Saved: {figure_path}")
plt.show()

---

## 7. Reconstruction: Information Loss vs Compression

In [None]:
# Test reconstruction with different numbers of components
n_components_list = [2, 5, 10, 20, 30, 50]
reconstruction_errors = []
relative_errors = []

# Normalizer for the relative error: the mean per-feature variance, i.e. the
# MSE of reconstructing every sample by the feature means alone. Using
# np.var(X_high) without an axis would measure spread around the grand mean of
# the flattened array, which also counts differences between feature means and
# so does not correspond to the variance PCA decomposes.
baseline_mse = np.mean(np.var(X_high, axis=0))

for n_comp in n_components_list:
    # Project to n_comp dimensions
    Z = pca_result.transform(X_high, n_components=n_comp)
    
    # Reconstruct in the original feature space
    X_reconstructed = pca_result.inverse_transform(Z)
    
    # Compute error: mean squared residual over all entries
    error = np.mean((X_high - X_reconstructed) ** 2)
    relative_error = error / baseline_mse
    
    reconstruction_errors.append(error)
    relative_errors.append(relative_error)
    
    print(f"n={n_comp:2d}: MSE={error:.4f}, Relative Error={relative_error:.4f}, "
          f"Compression={100*(1-n_comp/n_features):.1f}%")

In [None]:
# Reconstruction error against the number of retained components:
# absolute MSE on a linear axis (left) and relative error on a log axis (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    (ax1, ax1.plot, reconstruction_errors, 'bo-',
     'Reconstruction MSE', 'Reconstruction Error vs Components'),
    (ax2, ax2.semilogy, relative_errors, 'ro-',
     'Relative Error (log scale)', 'Relative Reconstruction Error'),
]
for axis, draw, values, fmt, y_label, title in panels:
    draw(n_components_list, values, fmt, linewidth=2, markersize=8)
    axis.set_xlabel('Number of Components', fontsize=12)
    axis.set_ylabel(y_label, fontsize=12)
    axis.set_title(title, fontsize=13, fontweight='bold')
    axis.grid(True, alpha=0.3)

plt.tight_layout()
figure_path = REPORTS_DIR / '04_reconstruction_error.png'
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
print(f"✓ Saved: {figure_path}")
plt.show()

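**Key Result**: With just 10 components (80% compression), we retain roughly 85–90% of the variance (for the eigenvalue spectrum $e^{-k/5}$ used here, the first 10 components carry about $1 - e^{-2} \approx 86\%$). About 20 components (60% compression) are needed to retain ~98%.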

---

## 8. Compare to sklearn

In [None]:
from sklearn.decomposition import PCA as SklearnPCA

# sklearn PCA: fit on X_high and project onto the first 10 components
sklearn_pca = SklearnPCA(n_components=10)
Z_sklearn = sklearn_pca.fit_transform(X_high)

# Repo implementation (pca_result was fitted on X_high in an earlier cell)
Z_ours = pca_result.transform(X_high, n_components=10)

print("Comparison (10 components):")
print("\nExplained variance ratio:")
print(f"  sklearn: {sklearn_pca.explained_variance_ratio_}")
print(f"  Ours:    {pca_result.explained_variance_ratio[:10]}")

# Note: Signs of PCs might differ (both valid), so the projections are compared
# by shape only and the reconstructions by their error.
print("\nProjected data shape:")
print(f"  sklearn: {Z_sklearn.shape}")
print(f"  Ours:    {Z_ours.shape}")

# Compare reconstructions
X_recon_sklearn = sklearn_pca.inverse_transform(Z_sklearn)
X_recon_ours = pca_result.inverse_transform(Z_ours)

mse_sklearn = np.mean((X_high - X_recon_sklearn) ** 2)
mse_ours = np.mean((X_high - X_recon_ours) ** 2)

print("\nReconstruction MSE:")
print(f"  sklearn: {mse_sklearn:.6f}")
print(f"  Ours:    {mse_ours:.6f}")
# Only claim agreement when the two errors actually agree.
if np.isclose(mse_sklearn, mse_ours):
    print("\n✓ Essentially identical!")
else:
    print("\n✗ Reconstruction errors differ beyond numerical tolerance")

---

## 9. Key Takeaways

✅ **PCA finds orthogonal directions of maximum variance**:
   - First PC: direction of largest spread
   - Subsequent PCs: orthogonal, decreasing variance

✅ **SVD is the computational tool**:
   - More stable than covariance eigendecomposition
   - Single step: $X = U \Sigma V^T$ → PCs are columns of $V$

✅ **Dimensionality reduction**:
   - Keep first $k$ PCs to retain most variance
   - Typical: 10-50 components capture 90-99% of information
   - Useful for visualization, denoising, compression

✅ **Reconstruction trade-off**:
   - More components → lower error, less compression
   - Fewer components → higher error, more compression
   - Choose $k$ based on application needs

✅ **When to use PCA**:
   - High-dimensional data visualization
   - Feature extraction before ML
   - Noise reduction (keep top PCs)
   - Detecting correlations/multicollinearity

---

## 10. Common Pitfalls

❌ **Forgetting to center data**: PCA requires mean=0; sklearn does this automatically

❌ **Not standardizing features**: Features with larger scales dominate PCs; use StandardScaler first

❌ **Interpreting PC loadings blindly**: High-dimensional PCs are hard to interpret; use with caution

❌ **Using PCA for classification directly**: PCA maximizes variance, not class separability (use LDA instead)

❌ **Choosing k without looking at explained variance**: Always check scree plot or cumulative variance

---

## 11. Exercises

**Exercise 1**: Load sklearn's digits dataset (8×8 images). Apply PCA and visualize first 2 PCs. Do digit classes separate?

In [None]:
# Your code here

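**Exercise 2**: Implement whitening transformation: $Z_{\text{white}} = Z \cdot \text{diag}\left(\sqrt{n-1}/\sigma_k\right)$, i.e. divide each score column by its standard deviation $\sqrt{\sigma_k^2/(n-1)}$. Verify that $\text{Cov}(Z_{\text{white}}) = I$.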

In [None]:
# Your code here

**Exercise 3**: Add Gaussian noise to data. Show that PCA + reconstruction (keeping top 90% variance) filters noise.

In [None]:
# Your code here

**Exercise 4**: Compare PCA (unsupervised) vs Linear Discriminant Analysis (supervised) on iris dataset. Which gives better class separation?

In [None]:
# Your code here

**Exercise 5**: Implement incremental PCA (process data in batches). Compare to full PCA on memory usage for large datasets.

In [None]:
# Your code here

---

## Solutions

<details>
<summary><b>Exercise 1 Solution</b></summary>

```python
from sklearn.datasets import load_digits

digits = load_digits()
X_digits = digits.data  # 1797 samples, 64 features (8x8 images)
y_digits = digits.target

# Standardize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_digits)

# PCA to 2D
pca_digits = pca_via_svd(X_scaled, n_components=2)
Z_2d = pca_digits.transform(X_scaled, n_components=2)

# Visualize
plt.figure(figsize=(10, 8))
scatter = plt.scatter(Z_2d[:, 0], Z_2d[:, 1], c=y_digits, cmap='tab10', alpha=0.7, s=30)
plt.colorbar(scatter, label='Digit Class')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('Digits Dataset: First 2 PCs')
plt.grid(True, alpha=0.3)
plt.show()

# Check explained variance
print(f"Explained variance (2 PCs): {pca_digits.explained_variance_ratio[:2].sum()*100:.1f}%")
```
</details>

<details>
<summary><b>Exercise 2 Solution</b></summary>

```python
# Whitening transformation
pca_result = pca_via_svd(X_high, n_components=10)
Z = pca_result.transform(X_high, n_components=10)

# Whiten: divide by sqrt(explained_variance)
Z_white = Z / np.sqrt(pca_result.explained_variance[:10])

# Verify covariance is identity
cov_white = np.cov(Z_white.T)
print("Covariance of whitened data:")
print(cov_white)
print(f"\nIs identity? {np.allclose(cov_white, np.eye(10))}")
```
</details>

<details>
<summary><b>Exercise 3 Solution</b></summary>

```python
# Add noise
np.random.seed(42)
noise_level = 0.5
X_noisy = X_high + noise_level * np.random.randn(*X_high.shape)

# PCA on noisy data
pca_noisy = pca_via_svd(X_noisy)

# Keep components explaining 90% variance
cumsum = np.cumsum(pca_noisy.explained_variance_ratio)
n_keep = np.argmax(cumsum >= 0.9) + 1
print(f"Keeping {n_keep} components (90% variance)")

# Reconstruct
Z_noisy = pca_noisy.transform(X_noisy, n_components=n_keep)
X_denoised = pca_noisy.inverse_transform(Z_noisy)

# Compare errors
mse_noisy = np.mean((X_high - X_noisy) ** 2)
mse_denoised = np.mean((X_high - X_denoised) ** 2)

print(f"MSE (noisy):    {mse_noisy:.4f}")
print(f"MSE (denoised): {mse_denoised:.4f}")
print(f"Improvement: {100*(1 - mse_denoised/mse_noisy):.1f}%")
```
</details>

---

## Summary Report

In [None]:
# Number of components at which the cumulative explained variance first
# reaches 95% / 99% (first qualifying index, plus one).
n_95 = np.argmax(cumulative_var >= 0.95) + 1
n_99 = np.argmax(cumulative_var >= 0.99) + 1

# Look up the reconstruction results for the component count actually quoted
# in the report instead of relying on a hard-coded list position
# (position 1 of n_components_list is the 5-component run, not the 10-component one).
n_report = 10
report_idx = n_components_list.index(n_report)

report = f"""
SVD AND PCA: GEOMETRIC ANALYSIS
{'='*70}

DATA CHARACTERISTICS:
  Samples: {n_samples}
  Features: {n_features}
  Total variance: {np.var(X_high, axis=0).sum():.2f}

DIMENSIONALITY REDUCTION:
  Components for 95% variance: {n_95} ({100*(1-n_95/n_features):.0f}% compression)
  Components for 99% variance: {n_99} ({100*(1-n_99/n_features):.0f}% compression)

TOP 5 PRINCIPAL COMPONENTS:
"""

# One line per leading component: variance ratio as a fraction and a percentage
report += "".join(
    f"  PC{i+1}: {var_ratio[i]:.4f} ({var_ratio[i]*100:.2f}%)\n"
    for i in range(5)
)

report += f"""
RECONSTRUCTION QUALITY ({n_report} components):
  MSE: {reconstruction_errors[report_idx]:.4f}
  Relative Error: {relative_errors[report_idx]:.4f}
  Compression: {100*(1-n_report/n_features):.0f}%

KEY FINDINGS:
  1. First {n_95} PCs capture 95% of information
  2. SVD-based PCA matches sklearn exactly
  3. Reconstruction error decreases exponentially with components
  4. Huge dimensionality reduction possible with minimal loss

GEOMETRIC INTERPRETATION:
  - PCA rotates data to align with variance directions
  - Principal components are orthogonal
  - Ordered by importance (explained variance)
  - Truncation = projection onto lower-dimensional subspace

PRACTICAL APPLICATIONS:
  - Visualization: Project to 2D/3D for plotting
  - Feature extraction: Reduce features before ML
  - Denoising: Keep top PCs, discard noisy low-variance components
  - Compression: Store low-dimensional representation

Plots saved in: {REPORTS_DIR}/
"""

print(report)

# Explicit encoding so the file content does not depend on the platform default
with open(REPORTS_DIR / '04_svd_pca_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)

print(f"\n✓ Report saved: {REPORTS_DIR / '04_svd_pca_report.txt'}")