# PCA via SVD: Explained Variance and Reconstruction

This notebook explores Principal Component Analysis (PCA) through Singular Value Decomposition (SVD).

**Learning objectives:**
- Understand the geometry of the SVD and how it connects to PCA
- Visualize explained variance and scree plots
- Demonstrate reconstruction error vs number of components
- Apply PCA for dimensionality reduction

In [None]:
# Import Required Libraries
import sys
sys.path.append('/workspaces/Computational-ML-lab')

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import seaborn as sns

# The package directories "01_numerical_toolbox" and "00_repo_standards" start
# with a digit, so they are not valid Python identifiers and a plain
# `from modules.01_... import ...` statement is a SyntaxError. Resolve the
# modules through their dotted string path instead; the names bound below are
# the same ones the rest of the notebook uses.
import importlib

_linear_algebra = importlib.import_module(
    "modules.01_numerical_toolbox.src.linear_algebra"
)
pca_via_svd = _linear_algebra.pca_via_svd
condition_number = _linear_algebra.condition_number
ridge_regularization = _linear_algebra.ridge_regularization
demonstrate_ill_conditioning = _linear_algebra.demonstrate_ill_conditioning

set_seed = importlib.import_module(
    "modules.00_repo_standards.src.mlphys_core"
).set_seed

# Configure plotting.
# The matplotlib style sheet is applied first and the seaborn palette is set
# afterwards; keep this order so the palette call runs on top of the style.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
%matplotlib inline

## 1. Generate Correlated Data

Create synthetic data with known covariance structure.

In [None]:
# Create correlated 2D data for visualization
set_seed(42)

# Covariance with strong positive correlation:
#   rho = 1.2 / sqrt(2.0 * 1.0) ~= 0.85
# A covariance matrix must be positive semi-definite, i.e. the off-diagonal
# term must satisfy cov_12**2 <= var_1 * var_2. An off-diagonal of 1.5 would
# give det = 2.0 * 1.0 - 1.5**2 = -0.25 < 0, which is not a valid covariance.
cov = np.array([[2.0, 1.2],
                [1.2, 1.0]])

X_2d = np.random.multivariate_normal([0, 0], cov, size=200)

# Plot original data
plt.figure(figsize=(8, 8))
plt.scatter(X_2d[:, 0], X_2d[:, 1], alpha=0.6, s=50)
plt.axhline(0, color='k', linestyle='--', linewidth=0.5)
plt.axvline(0, color='k', linestyle='--', linewidth=0.5)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.title("Original 2D Data (Correlated)")
plt.axis('equal')  # equal aspect so the correlation direction is not distorted
plt.grid(True, alpha=0.3)
plt.show()

# Sample covariance (features in columns, hence the transpose for np.cov).
# Use a real newline escape so the matrix starts on its own line.
print(f"Covariance matrix:\n{np.cov(X_2d.T)}")

## 2. Apply PCA and Visualize Principal Components

In [None]:
# Perform PCA
pca_result = pca_via_svd(X_2d, n_components=2)

print("Principal Components (eigenvectors):")
print(pca_result.components)
print(f"\nExplained variance: {pca_result.explained_variance}")
print(f"Explained variance ratio: {pca_result.explained_variance_ratio}")
print(f"Total variance explained: {np.sum(pca_result.explained_variance_ratio):.4f}")

# Visualize with principal components overlaid
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

# Original space with PCs
ax1.scatter(X_2d[:, 0], X_2d[:, 1], alpha=0.6, s=50, label='Data')
ax1.axhline(0, color='k', linestyle='--', linewidth=0.5)
ax1.axvline(0, color='k', linestyle='--', linewidth=0.5)

# Draw principal components as arrows.
# The arrows start at the sample mean: principal directions describe spread
# around the data centroid, which only approximately coincides with (0, 0)
# for a finite sample.
origin = X_2d.mean(axis=0)
scale = 3.0  # arrow length in units of standard deviations along each PC
for i in range(2):
    pc = pca_result.components[:, i]  # i-th component read as a column
    length = np.sqrt(pca_result.explained_variance[i]) * scale
    ax1.arrow(origin[0], origin[1], pc[0] * length, pc[1] * length,
              head_width=0.2, head_length=0.3, fc=f'C{i+1}', ec=f'C{i+1}',
              linewidth=3, label=f'PC{i+1}')

ax1.set_xlabel("$x_1$")
ax1.set_ylabel("$x_2$")
ax1.set_title("Original Space with Principal Components")
ax1.legend()
ax1.axis('equal')
ax1.grid(True, alpha=0.3)

# Transformed space (PCA coordinates)
Z = pca_result.transform(X_2d)
ax2.scatter(Z[:, 0], Z[:, 1], alpha=0.6, s=50, color='C2')
ax2.axhline(0, color='k', linestyle='--', linewidth=0.5)
ax2.axvline(0, color='k', linestyle='--', linewidth=0.5)
ax2.set_xlabel("PC1")
ax2.set_ylabel("PC2")
ax2.set_title("PCA-Transformed Space (Decorrelated)")
ax2.axis('equal')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. High-Dimensional PCA and Scree Plot

Generate higher-dimensional data to demonstrate variance selection.

In [None]:
# Create 20D data with decaying eigenvalues
n_features = 20
eigenvalues = np.exp(-np.arange(n_features) / 3)  # Exponential decay
# Random orthogonal basis from the QR factorization of a Gaussian matrix,
# used to rotate the diagonal spectrum into a dense covariance matrix.
Q, _ = np.linalg.qr(np.random.randn(n_features, n_features))
Sigma = Q @ np.diag(eigenvalues) @ Q.T

X_high = np.random.multivariate_normal(np.zeros(n_features), Sigma, size=300)

# Perform PCA
pca_high = pca_via_svd(X_high)

# 1-based component indices, sized from what PCA actually returned so the
# plots stay aligned even if fewer than n_features components come back.
variance_ratio = np.asarray(pca_high.explained_variance_ratio)
component_idx = np.arange(1, len(variance_ratio) + 1)

# Plot scree plot and cumulative variance
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# Scree plot
ax1.bar(component_idx, variance_ratio, alpha=0.7, color='steelblue')
ax1.set_xlabel("Principal Component")
ax1.set_ylabel("Explained Variance Ratio")
ax1.set_title("Scree Plot")
ax1.grid(True, alpha=0.3, axis='y')

# Cumulative variance
cumulative_var = np.cumsum(variance_ratio)
ax2.plot(component_idx, cumulative_var, 'o-', linewidth=2, markersize=8, color='darkorange')
ax2.axhline(0.95, color='r', linestyle='--', label='95% threshold')
ax2.axhline(0.99, color='purple', linestyle='--', label='99% threshold')
ax2.set_xlabel("Number of Components")
ax2.set_ylabel("Cumulative Explained Variance")
ax2.set_title("Cumulative Variance Explained")
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Find components for thresholds.
# cumulative_var is non-decreasing, so searchsorted returns the first index
# whose value is >= the threshold. Unlike argmax on a boolean mask (which
# returns 0 when the threshold is never reached and would report a single
# component), an unreachable threshold is clamped to "all components".
n_95 = min(int(np.searchsorted(cumulative_var, 0.95)) + 1, len(cumulative_var))
n_99 = min(int(np.searchsorted(cumulative_var, 0.99)) + 1, len(cumulative_var))

print(f"Components needed for 95% variance: {n_95}/{n_features}")
print(f"Components needed for 99% variance: {n_99}/{n_features}")
print(f"\nDimensionality reduction: {n_features} → {n_95} ({(1 - n_95/n_features)*100:.1f}% reduction)")

## 4. Reconstruction Error vs Number of Components

In [None]:
# Compute reconstruction error for different numbers of components
n_components_range = range(1, n_features + 1)
reconstruction_errors = []

for n_comp in n_components_range:
    # Project onto the first n_comp components, then map back to feature space.
    Z = pca_high.transform(X_high, n_components=n_comp)
    X_reconstructed = pca_high.inverse_transform(Z)
    # Mean over every entry of the (samples x features) residual matrix.
    error = np.mean((X_high - X_reconstructed) ** 2)
    reconstruction_errors.append(error)

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# Absolute error
ax1.plot(n_components_range, reconstruction_errors, 'o-', linewidth=2, markersize=6, color='crimson')
ax1.set_xlabel("Number of Components")
ax1.set_ylabel("Mean Squared Reconstruction Error")
ax1.set_title("Reconstruction Error vs Components")
ax1.grid(True, alpha=0.3)

# Relative error (as fraction of total variance).
# The errors above are per-element means, so the reference must use the same
# normalization: the MSE of the zero-component reconstruction, i.e. replacing
# every sample by the feature-wise mean. Dividing by the *sum* of per-feature
# variances instead would understate the fraction by roughly a factor of
# n_features.
baseline_mse = np.mean((X_high - X_high.mean(axis=0)) ** 2)
relative_errors = np.array(reconstruction_errors) / baseline_mse
# On the log axis the full-rank point sits at round-off level (or is dropped
# if it is exactly zero).
ax2.semilogy(n_components_range, relative_errors, 'o-', linewidth=2, markersize=6, color='darkgreen')
ax2.set_xlabel("Number of Components")
ax2.set_ylabel("Relative Reconstruction Error (log scale)")
ax2.set_title("Relative Error Decay")
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Reconstruction error for different components:")
for n_comp in [1, 5, 10, 15, 20]:
    if n_comp <= n_features:
        # reconstruction_errors is 0-indexed while component counts start at 1.
        error = reconstruction_errors[n_comp - 1]
        print(f"  {n_comp:2d} components: MSE = {error:.6f}")

## 5. Ill-Conditioning and Ridge Stabilization

Demonstrate how ridge regularization improves numerical stability.

In [None]:
# Demonstrate ill-conditioning effects
result = demonstrate_ill_conditioning(kappa_target=1000.0, n_samples=100, n_features=50, seed=42)

# Real newline escapes are used below so the blank separator lines actually
# appear in the output instead of a literal backslash-n.
print(f"Matrix condition number: κ = {result['condition_number']:.1f}")
print(f"\nOLS solution norm: {result['ols_norm']:.2f}")
print(f"True weights norm: {result['true_norm']:.2f}")
print("\nRidge regularization results:")
print(f"{'λ':<10s} {'Weight norm':<15s} {'κ_before':<15s} {'κ_after':<15s} {'Train MSE':<15s}")

# Mapping of ridge parameter λ -> per-λ diagnostics dict.
ridge_results = result['ridge_results']

for lambda_, ridge_info in ridge_results.items():
    print(f"{lambda_:<10.2f} "
          f"{ridge_info['norm']:<15.2f} "
          f"{ridge_info['kappa_before']:<15.1f} "
          f"{ridge_info['kappa_after']:<15.1f} "
          f"{ridge_info['train_mse']:<15.6f}")

# Visualize conditioning improvement.
# Keys and values of the same dict iterate in the same order, so the three
# lists stay aligned.
lambdas = list(ridge_results)
kappas_after = [info['kappa_after'] for info in ridge_results.values()]
weight_norms = [info['norm'] for info in ridge_results.values()]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# Condition number vs lambda
ax1.semilogx(lambdas, kappas_after, 'o-', linewidth=2, markersize=10, color='steelblue')
ax1.axhline(result['condition_number'], color='r', linestyle='--', label='Original κ')
ax1.set_xlabel("Ridge Parameter λ")
ax1.set_ylabel("Condition Number (after ridge)")
ax1.set_title("Conditioning Improvement with Ridge")
ax1.legend()
ax1.grid(True, alpha=0.3)

# Weight norm vs lambda (shrinkage effect)
ax2.semilogx(lambdas, weight_norms, 'o-', linewidth=2, markersize=10, color='darkorange')
ax2.axhline(result['true_norm'], color='g', linestyle='--', label='True norm')
ax2.axhline(result['ols_norm'], color='r', linestyle='--', label='OLS norm')
ax2.set_xlabel("Ridge Parameter λ")
ax2.set_ylabel("Solution Norm ||w||")
ax2.set_title("Weight Shrinkage Effect")
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nKey insight: Ridge adds λ to all eigenvalues, reducing κ = σ_max/σ_min")

## 6. Key Takeaways

**PCA via SVD:**
- More stable than eigendecomposition of covariance matrix
- For mean-centered $X$, the SVD $X = U \Sigma V^T$ gives the principal components as the columns of $V$
- Variance along the $i$-th component = $\sigma_i^2 / (n-1)$

**Dimensionality reduction:**
- Keep components explaining 95-99% of variance
- Balance reconstruction error vs compression
- Scree plot helps identify "elbow"

**Numerical stability:**
- Ill-conditioning ($\kappa \gg 1$) amplifies errors
- Ridge regularization: $\min ||Xw - y||^2 + \lambda ||w||^2$
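- Adds $\lambda$ to the eigenvalues of $X^T X$, reducing its condition number from $\sigma_{\max}^2 / \sigma_{\min}^2$ to $(\sigma_{\max}^2 + \lambda) / (\sigma_{\min}^2 + \lambda)$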

**ML applications:**
- PCA for feature extraction and visualization
- Ridge regression prevents overfitting on correlated features
- Understanding $\kappa$ guides preprocessing choices