In [1]:
import numpy as np

In [2]:
class SGD:
    """Stochastic gradient descent with classical momentum.

    A separate velocity buffer is kept per layer, keyed by ``layer_id``, so a
    single optimizer instance can serve every layer of a network.
    """

    def __init__(self, learning_rate=0.01, momentum=0.9):
        """Store the hyper-parameters and start with no velocity buffers.

        Args:
            learning_rate: Scale applied to each gradient.
            momentum: Fraction of the previous velocity carried into the next step.
        """
        self.lr = learning_rate
        self.momentum = momentum
        # layer_id -> {'W': velocity of the weights, 'b': velocity of the bias}
        self.velocities = {}

    def update(self, layer_id, W, dW, b, db):
        """Advance one layer by a single momentum step.

        Args:
            layer_id: Hashable key identifying the layer's velocity buffers.
            W: Current weights.
            dW: Gradient of the loss with respect to ``W``.
            b: Current bias.
            db: Gradient of the loss with respect to ``b``.

        Returns:
            Tuple ``(new_W, new_b)``; the inputs themselves are not modified.
        """
        state = self.velocities.get(layer_id)
        if state is None:
            # First time this layer is seen: begin from zero velocity.
            state = {'W': np.zeros_like(W), 'b': np.zeros_like(b)}
            self.velocities[layer_id] = state

        # velocity <- momentum * velocity - lr * gradient
        for key, grad in (('W', dW), ('b', db)):
            state[key] = self.momentum * state[key] - self.lr * grad

        # Parameters move by the freshly computed velocity.
        return W + state['W'], b + state['b']

class Adam:
    """Adam optimizer with per-layer moment estimates for weights and biases.

    Each layer, identified by ``layer_id``, owns its first/second moment
    buffers and its own step counter. Keeping the counter per layer means the
    bias-correction terms stay right no matter how many layers share one
    optimizer instance or in which order they are updated.
    """

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Store the hyper-parameters and start with empty moment buffers.

        Args:
            learning_rate: Base step size.
            beta1: Decay rate of the first-moment (mean) estimate.
            beta2: Decay rate of the second-moment (squared gradient) estimate.
            epsilon: Small constant added to the denominator for numerical safety.
        """
        self.lr = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = {}  # layer_id -> {'W': ..., 'b': ...} first moment (mean of gradients)
        self.v = {}  # layer_id -> {'W': ..., 'b': ...} second moment (uncentered variance)
        self.steps = {}  # layer_id -> number of updates applied to that layer
        # Largest per-layer step count; equals the number of update() calls
        # when only one layer is used, as before.
        self.t = 0

    def _adam_step(self, param, grad, m_prev, v_prev, t):
        """Apply the Adam rule to one parameter; return (new_param, m, v)."""
        m = self.beta1 * m_prev + (1 - self.beta1) * grad
        v = self.beta2 * v_prev + (1 - self.beta2) * (grad**2)

        # Bias correction: the moments start at zero, so early estimates are
        # scaled up to compensate.
        m_hat = m / (1 - self.beta1**t)
        v_hat = v / (1 - self.beta2**t)

        new_param = param - self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
        return new_param, m, v

    def update(self, layer_id, W, dW, b, db):
        """Advance one layer's weights and bias by a single Adam step.

        Args:
            layer_id: Hashable key identifying the layer's optimizer state.
            W: Current weights.
            dW: Gradient of the loss with respect to ``W``.
            b: Current bias.
            db: Gradient of the loss with respect to ``b``.

        Returns:
            Tuple ``(new_W, new_b)``; the inputs themselves are not modified.
        """
        if layer_id not in self.m:
            self.m[layer_id] = {'W': np.zeros_like(W), 'b': np.zeros_like(b)}
            self.v[layer_id] = {'W': np.zeros_like(W), 'b': np.zeros_like(b)}

        # Count steps per layer: a counter shared by all layers would advance
        # once per layer per training step and distort the bias correction.
        t = self.steps.get(layer_id, 0) + 1
        self.steps[layer_id] = t
        self.t = max(self.t, t)

        m_state = self.m[layer_id]
        v_state = self.v[layer_id]

        new_W, m_state['W'], v_state['W'] = self._adam_step(
            W, dW, m_state['W'], v_state['W'], t
        )
        # The bias follows exactly the same Adam rule as the weights.
        new_b, m_state['b'], v_state['b'] = self._adam_step(
            b, db, m_state['b'], v_state['b'], t
        )

        return new_W, new_b

### Why Adam?
"Why did you use Adam for your project?"
1. Adaptive Learning Rates: I don't have to spend weeks tuning the learning rate because Adam adjusts it for me.
2. Handles Sparse Gradients: If some features are rare (like certain words in text), Adam handles them better than SGD.
3. Faster Convergence: It usually reaches a low loss much faster than standard Gradient Descent.

Compare the SGD update and the Adam update.
Notice how Adam uses dW**2 (the squared gradient).
Question for you: If a weight has a very large and shaky gradient, what will happen to the value of v_hat (the variance), and how will that affect the size of the weight update?
(Hint: Look at the line new_W = W - lr * m_hat / np.sqrt(v_hat). If v_hat is big, what happens to the division?)

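If a weight has a large and shaky gradient, the value of `v_hat` (the bias-corrected running average of squared gradients, loosely "the variance") will become very large, because squaring removes the sign and every big gradient adds to it. Meanwhile the positive and negative gradients partly cancel each other in `m_hat`, so the numerator stays comparatively small.
The result:
The total weight update becomes smaller.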
Why is this brilliant?
Adam acts like an automatic braking system. If a weight is oscillating wildly (shaky gradients), Adam thinks, "Whoa, this weight is unstable, let's take smaller steps." If a weight has a small, consistent gradient, Adam thinks, "This path is safe, let's speed up," and the small denominator scales the tiny gradient back up, so the step stays close to the full learning rate instead of shrinking with the gradient.

In [3]:
import numpy as np

def simulate_adam_step(gradient_history, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """Replay a sequence of scalar gradients through Adam and return the last step.

    Args:
        gradient_history: Iterable of gradients, one per simulated time step.
        lr: Base learning rate.
        beta1: Decay rate of the first-moment estimate.
        beta2: Decay rate of the second-moment estimate.
        epsilon: Small constant added to the denominator for numerical safety.

    Returns:
        The step ``lr * m_hat / (sqrt(v_hat) + epsilon)`` computed at the final
        time step, or ``0.0`` when ``gradient_history`` is empty (no gradient,
        no movement).
    """
    m = 0  # First moment
    v = 0  # Second moment
    step = 0.0  # Returned unchanged when there is nothing to replay

    # One simulated time step per recorded gradient; t starts at 1 so the
    # bias-correction denominators are never zero.
    for t, dW in enumerate(gradient_history, 1):
        m = beta1 * m + (1 - beta1) * dW
        v = beta2 * v + (1 - beta2) * (dW**2)

        # Bias correction
        m_hat = m / (1 - beta1**t)
        v_hat = v / (1 - beta2**t)

        # The effective step size
        step = lr * m_hat / (np.sqrt(v_hat) + epsilon)

    return step

# Two contrasting gradient histories for the same optimizer settings:
#   - stable: small values that barely change from step to step
#   - shaky:  values 100x larger whose sign flips on every step
stable_grads = [0.1, 0.11, 0.09, 0.1, 0.1]
shaky_grads = [10.0, -10.0, 10.0, -10.0, 10.0]

# Final Adam step produced by each history.
step_stable, step_shaky = (
    simulate_adam_step(history) for history in (stable_grads, shaky_grads)
)

# Report both steps side by side, followed by the takeaway.
print(
    f"Adam step for Stable Neuron: {step_stable:.6f}",
    f"Adam step for Shaky Neuron:  {step_shaky:.6f}",
    "\nObservation: Even though the shaky gradients were 100x larger,",
    "Adam kept the step size controlled!",
    sep="\n",
)

Adam step for Stable Neuron: 0.000996
Adam step for Shaky Neuron:  0.000204

Observation: Even though the shaky gradients were 100x larger,
Adam kept the step size controlled!
