#### import needed libraries

In [2]:
import numpy as np

#### problems
Assume `big_h` and `small_h` are the outputs of our neural network (logits).
Simply applying the exponential function to these values leads to problems. For large values the problem is directly visible as an overflow (`inf`). Very negative values underflow to `0`, which can lead to a division by zero in later computations.

In [10]:
# Logits large enough that exponentiating them overflows float64 (-> inf).
big_h = [42, 1337, 557]
softmax_nominator = np.exp(np.asarray(big_h))
softmax_nominator

  softmax_nominator = np.exp(big_h)


array([1.73927494e+018,             inf, 7.98043234e+241])

In [11]:
# Very negative logits: the exponential underflows towards (or exactly to) zero.
small_h = [-42, -1337, -557]
softmax_nominator = np.exp(np.asarray(small_h))
softmax_nominator

array([5.74952226e-019, 0.00000000e+000, 1.25306494e-242])

#### workaround
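Again, assume we feed the numbers in `hypothesis` into our softmax (which, among other things, applies the exponential function to them). This time, however, we first subtract the largest value in `hypothesis` from every value. Softmax is invariant to adding or subtracting the same constant from all inputs, so the result is unchanged, but the largest exponent is now `0` and overflow can no longer occur. As you can see, values that previously could not be processed are now handled with ease.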

In [6]:
hypothesis = [1335, 1337, 1322]
# Shift every logit by the largest one so that the maximum becomes 0.
safe_hypothesis = np.subtract(hypothesis, np.max(hypothesis))
safe_hypothesis

array([ -2,   0, -15])

In [7]:
# Exponentiate the shifted logits; every exponent is <= 0, so no overflow.
nominator = np.exp(np.asarray(safe_hypothesis))
nominator

array([1.35335283e-01, 1.00000000e+00, 3.05902321e-07])

In [8]:
# Normalization constant: the sum of all exponentiated, shifted logits.
denominator = nominator.sum()
denominator

1.1353355891389334

In [9]:
# Dividing by the normalization constant yields the softmax values.
result = np.divide(nominator, denominator)
result

array([1.19202890e-01, 8.80796841e-01, 2.69437797e-07])

#### safe softmax as a function

In [None]:
def softmax(z: np.ndarray, gradient: bool = False) -> np.ndarray:
    """
    Apply a numerically stable softmax along the last axis of ``z``.

    The maximum of each row is subtracted before exponentiating. This leaves
    the result unchanged but keeps every exponent <= 0, so ``np.exp`` cannot
    overflow. The input array is NOT modified.

    :param z: Logits. For a 2-D array of shape (N, C) every row is treated as
        one vector of C logits; a 1-D array of shape (C,) is treated as a
        single vector.
    :param gradient: Boolean flag. Default is False. The derivative is not
        implemented here, so the flag currently has no effect and the softmax
        values are always returned.
    :return: Array of the same shape as ``z`` holding the softmax values;
        the entries along the last axis sum to 1.
    """
    z = np.asarray(z)

    # Out-of-place subtraction: `z -= ...` would overwrite the caller's array.
    shifted = z - np.max(z, axis=-1, keepdims=True)  # for numeric stability
    exponentials = np.exp(shifted)  # computed once and reused below
    return exponentials / exponentials.sum(axis=-1, keepdims=True)

In [18]:
# A single row of five logits, i.e. shape (1, 5).
hypothesis = np.array([[2.0, 3.1, -1.0, 0.0, 1.7]])
softmax(hypothesis)

array([[0.20283537, 0.60935113, 0.01009858, 0.02745078, 0.15026414]])