# SGD

An implementation of the mini-batch stochastic gradient descent (SGD) algorithm.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

def sgd_optimizer(loss, grad_loss, w0, D, alpha, batch_size, n_epochs):
    """Minimize a loss with mini-batch stochastic gradient descent.

    Every epoch reshuffles the dataset with ``np.random.shuffle``, walks over
    it in consecutive mini-batches and, after each batch, applies the update
    ``w <- w - alpha * grad_loss(w, X_batch, y_batch)``. When ``batch_size``
    does not divide the number of samples, the last batch of the epoch is
    smaller than the others.

    Parameters
    ----------
    loss : callable
        Called as ``loss(w, X, y)`` on the whole (shuffled) dataset at the end
        of every epoch. The result is stored in a float array, so it must be
        a scalar.
    grad_loss : callable
        Called as ``grad_loss(w, X_batch, y_batch)``; its result is scaled by
        ``alpha`` and subtracted from ``w``.
    w0 : numpy.ndarray
        Initial weights. ``w0.shape[0]`` fixes the width of ``w_history``.
        The update rebinds ``w`` instead of writing in place, so ``w0`` itself
        is left untouched.
    D : tuple
        The pair ``(X, y)``. Both are indexed along their first axis and must
        contain the same number of samples.
    alpha : float
        Step size (learning rate).
    batch_size : int
        Number of samples per mini-batch; must be positive.
    n_epochs : int
        Number of passes over the dataset.

    Returns
    -------
    w_history : numpy.ndarray, shape (n_epochs, d)
        Weights at the end of each epoch.
    loss_history : numpy.ndarray, shape (n_epochs,)
        ``loss`` evaluated on the full dataset at the end of each epoch.
    grad_norm_history : numpy.ndarray, shape (n_epochs,)
        Mean over the epoch's mini-batches of the gradient 2-norm.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive, if the dataset is empty, or if
        ``X`` and ``y`` do not have the same number of samples.
    """
    X, y = D  # Unpack the data
    N = X.shape[0]

    # Fail early with a clear message. A negative batch size would otherwise
    # yield zero batches per epoch (no update at all and NaN gradient norms),
    # and a zero batch size would surface as an obscure ``range()`` error.
    if batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )
    if N == 0:
        raise ValueError("the dataset is empty: SGD needs at least one sample")
    if len(y) != N:
        raise ValueError(
            f"X and y must have the same number of samples, got {N} and {len(y)}"
        )

    d = w0.shape[0]
    idx = np.arange(N)

    # Initialization of history vectors (one entry per epoch)
    w_history = np.zeros((n_epochs, d))
    loss_history = np.zeros((n_epochs,))
    grad_norm_history = np.zeros((n_epochs,))

    # Initialize weights
    w = w0
    for epoch in range(n_epochs):
        # Shuffle the data at the beginning of each epoch. Fancy indexing
        # returns new arrays, so the caller's X and y are never reordered.
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # Gradient norms of the mini-batches seen during this epoch
        batch_grad_norms = []

        for batch_start in range(0, N, batch_size):
            batch_end = min(batch_start + batch_size, N)
            X_batch = X[batch_start:batch_end]
            y_batch = y[batch_start:batch_end]

            # Compute the gradient of the loss on the current mini-batch
            gradient = grad_loss(w, X_batch, y_batch)
            batch_grad_norms.append(np.linalg.norm(gradient, 2))

            # Update weights
            w = w - alpha * gradient

        # Save the values reached at the end of the epoch
        w_history[epoch] = w
        loss_history[epoch] = loss(w, X, y)
        grad_norm_history[epoch] = np.mean(batch_grad_norms)

    return w_history, loss_history, grad_norm_history