In [47]:
import numpy as np
# softmax stable function


def stable_softmax(x):
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)


def softmax_loss(X, y, W):
    A = stable_softmax(X.dot(W))
    idx = range(A.shape[0])
    return -np.mean(np.log(A[idx, y]))+0.5*np.sum(W**2)*1/A.shape[0]


def softmax_grad(X, y, W):
    """
    W: 2d numpy array of shape (d, C),
    each column correspoding to one output node
    X: 2d numpy array of shape (N, d), each row is one data point
    y: 1d numpy array -- label of each row of X
    """
    A = stable_softmax(X.dot(W))  # shape of (N, C)
    id0 = range(X.shape[0])
    A[id0, y] -= 1  # A - Y, shape of (N, C)
    return X.T.dot(A)/X.shape[0]


# test
X = np.array([[3, 3, 5], [1, 2, -1], [3, 3, 5], [1, 2, -1]])
y = np.array([0, 2, 1, 1])
W = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
print(softmax_loss(X, y, W))
A = stable_softmax(X.dot(W))
print(np.argmax(A, axis=1))
print(softmax_grad(X, y, W))


2.419278491495035
[2 1 2 1]
[[-0.4604913  -0.48754728  0.94803858]
 [-0.33074307 -0.38485502  0.71559809]
 [-1.11348078 -1.08642481  2.19990559]]
