# In [1]:
import numpy as np
from typing import Tuple

def softmax_loss_log(W: np.ndarray, X: np.ndarray, y: np.ndarray) -> Tuple[float, np.ndarray]:
    """
    Compute the mean softmax cross-entropy (log) loss and its gradient w.r.t. W.

    The raw class scores ``X @ W`` are fed through a softmax; the loss is the
    negative log-probability of the ground-truth class, averaged over all samples.

    :param W: Weight matrix of shape (D x C)
    :param X: Data matrix of shape (N x D)
    :param y: Integer class labels in [0, C), of shape (N,) or (N x 1)
    :return: Tuple of loss value and weight derivatives of shape (D x C).
    """
    # Gather relevant sizes
    N, _ = X.shape
    rows = np.arange(N)

    # Flatten the labels to 1-D. A column vector (N x 1) would otherwise broadcast
    # against `rows` into an (N x N) index and silently select the wrong entries.
    labels = np.asarray(y).reshape(-1)

    # Compute raw predictions (logits); these can range anywhere in (-inf, inf).
    scores = X @ W

    # Shift each row by its maximum for numerical stability. The softmax is
    # invariant to a per-row constant, and this keeps exp() from overflowing.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    normalizer = exp_shifted.sum(axis=1, keepdims=True)

    # Softmax probabilities: every row is in [0, 1] and sums to 1.
    probs = exp_shifted / normalizer

    # Mean log loss, computed in log space as (shifted logit - log normalizer)
    # instead of log(probs): this stays finite even when the ground-truth
    # probability underflows to 0.
    log_likelihood = shifted[rows, labels] - np.log(normalizer[:, 0])
    loss = -np.sum(log_likelihood) / N

    # Gradient of the summed loss w.r.t. the logits is (softmax - one_hot(y)),
    # so subtract 1 at each sample's ground-truth class.
    probs[rows, labels] -= 1

    # Chain rule through scores = X @ W; divide by N because the loss is a mean.
    dW = (X.T @ probs) / N

    return loss, dW
