#### Gradients - Analytic vs PyTorch vs Finite Difference

In [None]:
import torch
import numpy as np

# Reproducible toy regression problem: one feature (row) observed on five samples (columns)
np.random.seed(0)

X_np = np.random.randn(1, 5)
y_np = np.random.randn(1, 5)
m = X_np.shape[1]  # number of samples

# float32 torch copies of the data
X = torch.tensor(X_np).float()
y = torch.tensor(y_np).float()

# Random initial weight and bias, drawn after the data so the RNG stream order is kept
w_np = np.random.randn(1, 1)
b_np = np.random.randn(1, 1)

# float32 torch copies of the parameters; these are the leaves autograd differentiates
w = torch.tensor(w_np).float().requires_grad_(True)
b = torch.tensor(b_np).float().requires_grad_(True)

# ----------- PyTorch gradient computation -------------
# Linear model followed by the mean squared error over the m samples
z = torch.matmul(w.T, X) + b
loss = ((z - y) ** 2).sum() / m
loss.backward()

# Pull the accumulated gradients out of the graph as plain NumPy arrays
dw_torch = w.grad.detach().clone().numpy()
db_torch = b.grad.detach().clone().numpy()

# ----------- Analytic gradient computation (NumPy) -------------
# dL/dw = (2/m) * X (z - y)^T   and   dL/db = (2/m) * 1 (z - y)^T
z_np = np.matmul(w_np.T, X_np) + b_np
dw_np = np.matmul(2 * (1 / m) * X_np, (z_np - y_np).T)
db_np = np.matmul(2 * (1 / m) * np.ones((1, m)), (z_np - y_np).T)

# ----------- Finite-difference gradient computation (two-sided) -------------
def compute_finite_difference_gradient(X, y, w, b, epsilon=1e-5):
    """
    Estimate the gradients of the mean squared error with two-sided finite differences.

    The loss is ``sum((w.T @ X + b - y) ** 2) / m`` with ``m = X.shape[1]``. Each
    entry of ``w`` and ``b`` is perturbed by ``+epsilon`` and ``-epsilon`` in turn
    and the gradient is approximated by
    ``(loss_plus - loss_minus) / (2 * epsilon)``.

    The caller's ``w`` and ``b`` are never modified: all perturbations are applied
    to private floating-point copies.

    Args:
        X (numpy.ndarray): Input features; samples are stored along axis 1.
        y (numpy.ndarray): Target values, broadcastable against ``w.T @ X + b``.
        w (numpy.ndarray): Weights.
        b (numpy.ndarray): Bias.
        epsilon (float): Small perturbation value for finite difference.

    Returns:
        dw (numpy.ndarray): Gradient w.r.t. weights, same shape as ``w``.
        db (numpy.ndarray): Gradient w.r.t. bias, same shape as ``b``.
    """

    m = X.shape[1]

    def as_float_copy(param):
        """Return an independent floating-point copy of ``param``."""
        param = np.asarray(param)
        # Keep an existing floating dtype; promote anything else (e.g. integers),
        # because adding epsilon in place to an integer array is not possible.
        is_floating = np.issubdtype(param.dtype, np.floating)
        dtype = param.dtype if is_floating else np.float64
        return np.array(param, dtype=dtype)

    w_work = as_float_copy(w)
    b_work = as_float_copy(b)

    def loss_at_current_params():
        """Evaluate the mean squared error at the current working parameters."""
        z = w_work.T @ X + b_work
        return np.sum((z - y) ** 2) / m

    def central_difference(param):
        """
        Compute the finite-difference gradient for every entry of ``param``.

        ``param`` must be ``w_work`` or ``b_work`` so that the in-place
        perturbation is seen by ``loss_at_current_params``.
        """
        grad = np.zeros_like(param)
        # Visit every element (not just every row) so any parameter shape works.
        for idx in np.ndindex(param.shape):
            original = param[idx]

            param[idx] = original + epsilon
            loss_plus = loss_at_current_params()

            param[idx] = original - epsilon
            loss_minus = loss_at_current_params()

            # Restore the saved value exactly; undoing with +/- epsilon arithmetic
            # could leave floating-point rounding residue behind.
            param[idx] = original

            grad[idx] = (loss_plus - loss_minus) / (2 * epsilon)
        return grad

    dw = central_difference(w_work)
    db = central_difference(b_work)

    return dw, db

# Numerically estimate both gradients with two-sided finite differences
dw_fd, db_fd = compute_finite_difference_gradient(X=X_np, y=y_np, w=w_np, b=b_np)

# ----------- Compare Gradients -------------
# Element-wise absolute gaps between the analytic result and each alternative,
# first for the weight gradient ...
dw_diff_analytic_torch = np.absolute(dw_np - dw_torch)
dw_diff_analytic_fd = np.absolute(dw_np - dw_fd)

# ... then for the bias gradient
db_diff_analytic_torch = np.absolute(db_np - db_torch)
db_diff_analytic_fd = np.absolute(db_np - db_fd)

# Report every quantity as a "label value" line: weight block first, then bias block
for label, value in (
    ("Gradient w.r.t w (Analytic):", dw_np),
    ("Gradient w.r.t w (PyTorch):", dw_torch),
    ("Gradient w.r.t w (Finite Difference):", dw_fd),
    ("Difference (Analytic vs PyTorch):", dw_diff_analytic_torch),
    ("Difference (Analytic vs Finite Difference):", dw_diff_analytic_fd),
    ("Gradient w.r.t b (Analytic):", db_np),
    ("Gradient w.r.t b (PyTorch):", db_torch),
    ("Gradient w.r.t b (Finite Difference):", db_fd),
    ("Difference (Analytic vs PyTorch):", db_diff_analytic_torch),
    ("Difference (Analytic vs Finite Difference):", db_diff_analytic_fd),
):
    print(label, value)

